From adf8abd346cbc7c11b075f47b26b2542f6d9cfba Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Fri, 30 Aug 2024 13:45:53 -0400
Subject: [PATCH] Apply GPU markers to ibeta_inv_ab

Remove NVRTC workaround
Apply GPU markers to ibeta_inverse
Apply GPU markers to t_dist_inv
Fix warning suppression
Add dispatch function and remove workaround
Move disabling block
Make binomial GPU enabled
Add SYCL testing of ibeta
Add SYCL testing of ibeta_inv
Add SYCL testing of ibeta_inv_ab
Add SYCL testing of full beta suite
Add markers to fwd decls
Add special forward decls for NVRTC
Add betac nvrtc testing
Add betac CUDA testing
Add ibeta CUDA testing
Add ibeta NVRTC testing
Add ibetac NVRTC testing
Add ibeta_derivative testing to nvrtc
Add ibeta_derivative CUDA testing
Add cbrt policy overload for NVRTC
Fix NVRTC definition of BOOST_MATH_IF_CONSTEXPR
Add ibeta_inv and ibetac_inv NVRTC testing
Fix make pair helper on device
Add CUDA testing of ibeta_inv* and ibetac_inv*
Move location so that it also works on NVRTC
Add NVRTC testing of ibeta_inv* and ibetac_inv*
Fixup test sets since they ignore the policy
Make the beta dist GPU compatible
Add beta dist SYCL testing
Add beta dist CUDA testing
Add beta dist NVRTC testing
---
 include/boost/math/distributions/beta.hpp     |  87 +++---
 .../boost/math/policies/error_handling.hpp    |  31 +--
 include/boost/math/special_functions/beta.hpp |  17 +-
 .../boost/math/special_functions/binomial.hpp |  17 +-
 include/boost/math/special_functions/cbrt.hpp |  28 ++-
 .../special_functions/detail/ibeta_inv_ab.hpp |  42 ++--
 .../detail/ibeta_inverse.hpp                  |  90 +++---
 .../detail/t_distribution_inv.hpp             |  31 +--
 .../boost/math/special_functions/gamma.hpp    |  77 +++---
 .../boost/math/special_functions/math_fwd.hpp |  58 +++--
 include/boost/math/tools/config.hpp           |   2 +-
 test/cuda_jamfile                             |  25 ++
 test/nvrtc_jamfile                            |  27 +++
 test/sycl_jamfile                             |   5 +
 test/test_bessel_i.cpp                        |   2 +-
 test/test_bessel_j.cpp                        |   2 +-
 test/test_bessel_k.cpp                        |   2 +-
 test/test_bessel_y.cpp                        |   2 +-
 test/test_beta.cpp                            |  12 +-
 test/test_beta.hpp                            |   6 +-
 test/test_beta_dist.cpp                       |  10 +-
 test/test_beta_dist_cdf_double.cu             | 109 +++++++++
 test/test_beta_dist_cdf_float.cu              | 109 +++++++++
 test/test_beta_dist_cdf_nvrtc_double.cpp      | 191 +++++++++++++++
 test/test_beta_dist_cdf_nvrtc_float.cpp       | 191 +++++++++++++++
 test/test_beta_dist_pdf_double.cu             | 109 +++++++++
 test/test_beta_dist_pdf_float.cu              | 109 +++++++++
 test/test_beta_dist_pdf_nvrtc_double.cpp      | 191 +++++++++++++++
 test/test_beta_dist_pdf_nvrtc_float.cpp       | 191 +++++++++++++++
 test/test_beta_dist_quan_double.cu            | 109 +++++++++
 test/test_beta_dist_quan_float.cu             | 109 +++++++++
 test/test_beta_dist_quan_nvrtc_double.cpp     | 191 +++++++++++++++
 test/test_beta_dist_quan_nvrtc_float.cpp      | 191 +++++++++++++++
 test/test_betac_double.cu                     | 146 ++++++++++++
 test/test_betac_float.cu                      | 146 ++++++++++++
 test/test_betac_nvrtc_double.cpp              | 196 ++++++++++++++++
 test/test_betac_nvrtc_float.cpp               | 196 ++++++++++++++++
 test/test_erf.cpp                             |   2 +-
 test/test_holtsmark.cpp                       |   2 +-
 test/test_ibeta.cpp                           |  11 +
 test/test_ibeta.hpp                           |   3 +-
 test/test_ibeta_derivative.cpp                |   2 +-
 test/test_ibeta_derivative_double.cu          | 149 ++++++++++++
 test/test_ibeta_derivative_float.cu           | 149 ++++++++++++
 test/test_ibeta_derivative_nvrtc_double.cpp   | 207 ++++++++++++++++
 test/test_ibeta_derivative_nvrtc_float.cpp    | 207 ++++++++++++++++
 test/test_ibeta_double.cu                     | 149 ++++++++++++
 test/test_ibeta_float.cu                      | 149 ++++++++++++
 test/test_ibeta_inv.cpp                       |  11 +
 test/test_ibeta_inv.hpp                       |   5 +-
 test/test_ibeta_inv_ab.cpp                    |  11 +
 test/test_ibeta_inv_ab.hpp                    |   3 +-
 test/test_ibeta_inv_double.cu                 | 149 ++++++++++++
 test/test_ibeta_inv_float.cu                  | 149 ++++++++++++
 test/test_ibeta_inv_nvrtc_double.cpp          | 207 ++++++++++++++++
 test/test_ibeta_inv_nvrtc_float.cpp           | 207 ++++++++++++++++
 test/test_ibeta_inva_double.cu                | 149 ++++++++++++
 test/test_ibeta_inva_float.cu                 | 149 ++++++++++++
 test/test_ibeta_inva_nvrtc_double.cpp         | 220 ++++++++++++++++++
 test/test_ibeta_inva_nvrtc_float.cpp          | 220 ++++++++++++++++++
 test/test_ibeta_invb_double.cu                | 149 ++++++++++++
 test/test_ibeta_invb_float.cu                 | 149 ++++++++++++
 test/test_ibeta_invb_nvrtc_double.cpp         | 220 ++++++++++++++++++
 test/test_ibeta_invb_nvrtc_float.cpp          | 220 ++++++++++++++++++
 test/test_ibeta_nvrtc_double.cpp              | 207 ++++++++++++++++
 test/test_ibeta_nvrtc_float.cpp               | 207 ++++++++++++++++
 test/test_ibetac_inv_double.cu                | 149 ++++++++++++
 test/test_ibetac_inv_float.cu                 | 149 ++++++++++++
 test/test_ibetac_inv_nvrtc_double.cpp         | 207 ++++++++++++++++
 test/test_ibetac_inv_nvrtc_float.cpp          | 207 ++++++++++++++++
 test/test_ibetac_inva_double.cu               | 149 ++++++++++++
 test/test_ibetac_inva_float.cu                | 149 ++++++++++++
 test/test_ibetac_inva_nvrtc_double.cpp        | 220 ++++++++++++++++++
 test/test_ibetac_inva_nvrtc_float.cpp         | 220 ++++++++++++++++++
 test/test_ibetac_invb_double.cu               | 149 ++++++++++++
 test/test_ibetac_invb_float.cu                | 149 ++++++++++++
 test/test_ibetac_invb_nvrtc_double.cpp        | 220 ++++++++++++++++++
 test/test_ibetac_invb_nvrtc_float.cpp         | 220 ++++++++++++++++++
 test/test_ibetac_nvrtc_double.cpp             | 207 ++++++++++++++++
 test/test_ibetac_nvrtc_float.cpp              | 207 ++++++++++++++++
 test/test_igamma.cpp                          |   2 +-
 test/test_igamma_inv.cpp                      |   2 +-
 test/test_igamma_inva.cpp                     |   2 +-
 test/test_landau.cpp                          |   2 +-
 test/test_mapairy.cpp                         |   2 +-
 85 files changed, 9093 insertions(+), 238 deletions(-)
 create mode 100644 test/test_beta_dist_cdf_double.cu
 create mode 100644 test/test_beta_dist_cdf_float.cu
 create mode 100644 test/test_beta_dist_cdf_nvrtc_double.cpp
 create mode 100644 test/test_beta_dist_cdf_nvrtc_float.cpp
 create mode 100644 test/test_beta_dist_pdf_double.cu
 create mode 100644 test/test_beta_dist_pdf_float.cu
 create mode 100644 test/test_beta_dist_pdf_nvrtc_double.cpp
 create mode 100644 test/test_beta_dist_pdf_nvrtc_float.cpp
 create mode 100644 test/test_beta_dist_quan_double.cu
 create mode 100644 test/test_beta_dist_quan_float.cu
 create mode 100644 test/test_beta_dist_quan_nvrtc_double.cpp
 create mode 100644 test/test_beta_dist_quan_nvrtc_float.cpp
 create mode 100644 test/test_betac_double.cu
 create mode 100644 test/test_betac_float.cu
 create mode 100644 test/test_betac_nvrtc_double.cpp
 create mode 100644 test/test_betac_nvrtc_float.cpp
 create mode 100644 test/test_ibeta_derivative_double.cu
 create mode 100644 test/test_ibeta_derivative_float.cu
 create mode 100644 test/test_ibeta_derivative_nvrtc_double.cpp
 create mode 100644 test/test_ibeta_derivative_nvrtc_float.cpp
 create mode 100644 test/test_ibeta_double.cu
 create mode 100644 test/test_ibeta_float.cu
 create mode 100644 test/test_ibeta_inv_double.cu
 create mode 100644 test/test_ibeta_inv_float.cu
 create mode 100644 test/test_ibeta_inv_nvrtc_double.cpp
 create mode 100644 test/test_ibeta_inv_nvrtc_float.cpp
 create mode 100644 test/test_ibeta_inva_double.cu
 create mode 100644 test/test_ibeta_inva_float.cu
 create mode 100644 test/test_ibeta_inva_nvrtc_double.cpp
 create mode 100644 test/test_ibeta_inva_nvrtc_float.cpp
 create mode 100644 test/test_ibeta_invb_double.cu
 create mode 100644 test/test_ibeta_invb_float.cu
 create mode 100644 test/test_ibeta_invb_nvrtc_double.cpp
 create mode
100644 test/test_ibeta_invb_nvrtc_float.cpp create mode 100644 test/test_ibeta_nvrtc_double.cpp create mode 100644 test/test_ibeta_nvrtc_float.cpp create mode 100644 test/test_ibetac_inv_double.cu create mode 100644 test/test_ibetac_inv_float.cu create mode 100644 test/test_ibetac_inv_nvrtc_double.cpp create mode 100644 test/test_ibetac_inv_nvrtc_float.cpp create mode 100644 test/test_ibetac_inva_double.cu create mode 100644 test/test_ibetac_inva_float.cu create mode 100644 test/test_ibetac_inva_nvrtc_double.cpp create mode 100644 test/test_ibetac_inva_nvrtc_float.cpp create mode 100644 test/test_ibetac_invb_double.cu create mode 100644 test/test_ibetac_invb_float.cu create mode 100644 test/test_ibetac_invb_nvrtc_double.cpp create mode 100644 test/test_ibetac_invb_nvrtc_float.cpp create mode 100644 test/test_ibetac_nvrtc_double.cpp create mode 100644 test/test_ibetac_nvrtc_float.cpp diff --git a/include/boost/math/distributions/beta.hpp b/include/boost/math/distributions/beta.hpp index 6c17ffa1a2..fef991a870 100644 --- a/include/boost/math/distributions/beta.hpp +++ b/include/boost/math/distributions/beta.hpp @@ -25,12 +25,15 @@ #ifndef BOOST_MATH_DIST_BETA_HPP #define BOOST_MATH_DIST_BETA_HPP +#include +#include #include #include // for beta. #include // complements. #include // error checks #include // isnan. #include // for root finding. +#include #if defined (BOOST_MSVC) # pragma warning(push) @@ -38,8 +41,6 @@ // in domain_error_imp in error_handling #endif -#include - namespace boost { namespace math @@ -48,7 +49,7 @@ namespace boost { // Common error checking routines for beta distribution functions: template - inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol) { if(!(boost::math::isfinite)(alpha) || (alpha <= 0)) { @@ -61,7 +62,7 @@ namespace boost } // bool check_alpha template - inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol) { if(!(boost::math::isfinite)(beta) || (beta <= 0)) { @@ -74,7 +75,7 @@ namespace boost } // bool check_beta template - inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol) { if((p < 0) || (p > 1) || !(boost::math::isfinite)(p)) { @@ -87,7 +88,7 @@ namespace boost } // bool check_prob template - inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol) { if(!(boost::math::isfinite)(x) || (x < 0) || (x > 1)) { @@ -100,28 +101,28 @@ namespace boost } // bool check_x template - inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol) { // Check both alpha and beta. 
return check_alpha(function, alpha, result, pol) && check_beta(function, beta, result, pol); } // bool check_dist template - inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol) { return check_dist(function, alpha, beta, result, pol) && beta_detail::check_x(function, x, result, pol); } // bool check_dist_and_x template - inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol) { return check_dist(function, alpha, beta, result, pol) && check_prob(function, p, result, pol); } // bool check_dist_and_prob template - inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol) { if(!(boost::math::isfinite)(mean) || (mean <= 0)) { @@ -133,7 +134,7 @@ namespace boost return true; } // bool check_mean template - inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol) + BOOST_MATH_GPU_ENABLED inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol) { if(!(boost::math::isfinite)(variance) || (variance <= 0)) { @@ -157,7 +158,7 @@ namespace boost typedef RealType value_type; typedef Policy policy_type; - beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta) + BOOST_MATH_GPU_ENABLED beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta) { RealType result; beta_detail::check_dist( @@ -167,11 +168,11 @@ namespace boost &result, Policy()); } // beta_distribution constructor. // Accessor functions: - RealType alpha() const + BOOST_MATH_GPU_ENABLED RealType alpha() const { return m_alpha; } - RealType beta() const + BOOST_MATH_GPU_ENABLED RealType beta() const { // . return m_beta; } @@ -183,11 +184,11 @@ namespace boost // http://www.itl.nist.gov/div898/handbook/eda/section3/eda366h.htm // http://www.epi.ucdavis.edu/diagnostictests/betabuster.html - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType mean, // Expected value of mean. RealType variance) // Expected value of variance. { - static const char* function = "boost::math::beta_distribution<%1%>::find_alpha"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha"; RealType result = 0; // of error checks. if(false == ( @@ -201,11 +202,11 @@ namespace boost return mean * (( (mean * (1 - mean)) / variance)- 1); } // RealType find_alpha - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( RealType mean, // Expected value of mean. RealType variance) // Expected value of variance. { - static const char* function = "boost::math::beta_distribution<%1%>::find_beta"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta"; RealType result = 0; // of error checks. 
if(false == ( @@ -223,12 +224,12 @@ namespace boost // Estimate alpha & beta from either alpha or beta, and x and probability. // Uses for these parameter estimators are unclear. - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType beta, // from beta. RealType x, // x. RealType probability) // cdf { - static const char* function = "boost::math::beta_distribution<%1%>::find_alpha"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha"; RealType result = 0; // of error checks. if(false == ( @@ -245,13 +246,13 @@ namespace boost return static_cast(ibeta_inva(beta, x, probability, Policy())); } // RealType find_alpha(beta, a, probability) - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( // ibeta_invb(T b, T x, T p); (alpha, x, cdf,) RealType alpha, // alpha. RealType x, // probability x. RealType probability) // probability cdf. { - static const char* function = "boost::math::beta_distribution<%1%>::find_beta"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta"; RealType result = 0; // of error checks. if(false == ( @@ -281,27 +282,27 @@ namespace boost #endif template - inline const std::pair range(const beta_distribution& /* dist */) + BOOST_MATH_GPU_ENABLED inline const boost::math::pair range(const beta_distribution& /* dist */) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), static_cast(1)); + return boost::math::pair(static_cast(0), static_cast(1)); } template - inline const std::pair support(const beta_distribution& /* dist */) + BOOST_MATH_GPU_ENABLED inline const boost::math::pair support(const beta_distribution& /* dist */) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. - return std::pair(static_cast(0), static_cast(1)); + return boost::math::pair(static_cast(0), static_cast(1)); } template - inline RealType mean(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType mean(const beta_distribution& dist) { // Mean of beta distribution = np. return dist.alpha() / (dist.alpha() + dist.beta()); } // mean template - inline RealType variance(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType variance(const beta_distribution& dist) { // Variance of beta distribution = np(1-p). RealType a = dist.alpha(); RealType b = dist.beta(); @@ -309,9 +310,9 @@ namespace boost } // variance template - inline RealType mode(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType mode(const beta_distribution& dist) { - static const char* function = "boost::math::mode(beta_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mode(beta_distribution<%1%> const&)"; RealType result; if ((dist.alpha() <= 1)) @@ -343,7 +344,7 @@ namespace boost //But WILL be provided by the derived accessor as quantile(0.5). template - inline RealType skewness(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType skewness(const beta_distribution& dist) { BOOST_MATH_STD_USING // ADL of std functions. 
RealType a = dist.alpha(); @@ -352,7 +353,7 @@ namespace boost } // skewness template - inline RealType kurtosis_excess(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const beta_distribution& dist) { RealType a = dist.alpha(); RealType b = dist.beta(); @@ -363,17 +364,17 @@ namespace boost } // kurtosis_excess template - inline RealType kurtosis(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const beta_distribution& dist) { return 3 + kurtosis_excess(dist); } // kurtosis template - inline RealType pdf(const beta_distribution& dist, const RealType& x) + BOOST_MATH_GPU_ENABLED inline RealType pdf(const beta_distribution& dist, const RealType& x) { // Probability Density/Mass Function. BOOST_FPU_EXCEPTION_GUARD - static const char* function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)"; BOOST_MATH_STD_USING // for ADL of std functions @@ -428,11 +429,11 @@ namespace boost } // pdf template - inline RealType cdf(const beta_distribution& dist, const RealType& x) + BOOST_MATH_GPU_ENABLED inline RealType cdf(const beta_distribution& dist, const RealType& x) { // Cumulative Distribution Function beta. BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; RealType a = dist.alpha(); RealType b = dist.beta(); @@ -459,12 +460,12 @@ namespace boost } // beta cdf template - inline RealType cdf(const complemented2_type, RealType>& c) + BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { // Complemented Cumulative Distribution Function beta. BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; RealType const& x = c.param; beta_distribution const& dist = c.dist; @@ -495,7 +496,7 @@ namespace boost } // beta cdf template - inline RealType quantile(const beta_distribution& dist, const RealType& p) + BOOST_MATH_GPU_ENABLED inline RealType quantile(const beta_distribution& dist, const RealType& p) { // Quantile or Percent Point beta function or // Inverse Cumulative probability distribution function CDF. // Return x (0 <= x <= 1), @@ -505,7 +506,7 @@ namespace boost // will be less than or equal to that value // is whatever probability you supplied as an argument. - static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; RealType result = 0; // of argument checks: RealType a = dist.alpha(); @@ -530,12 +531,12 @@ namespace boost } // quantile template - inline RealType quantile(const complemented2_type, RealType>& c) + BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { // Complement Quantile or Percent Point beta function . // Return the number of expected x for a given // complement of the probability q. 
- static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; // // Error checks: diff --git a/include/boost/math/policies/error_handling.hpp b/include/boost/math/policies/error_handling.hpp index ce3f1e7ccd..559e70a2f4 100644 --- a/include/boost/math/policies/error_handling.hpp +++ b/include/boost/math/policies/error_handling.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef BOOST_MATH_HAS_NVRTC @@ -877,20 +878,6 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, s } //namespace policies -namespace detail{ - -// -// Simple helper function to assist in returning a pair from a single value, -// that value usually comes from one of the error handlers above: -// -template -BOOST_MATH_GPU_ENABLED std::pair pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T) -{ - return std::make_pair(val, val); -} - -} - #ifdef _MSC_VER # pragma warning(pop) #endif @@ -1039,7 +1026,21 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, b } // namespace math } // namespace boost -#endif +#endif // BOOST_MATH_HAS_NVRTC + +namespace boost { namespace math { namespace detail { + +// +// Simple helper function to assist in returning a pair from a single value, +// that value usually comes from one of the error handlers above: +// +template +BOOST_MATH_GPU_ENABLED boost::math::pair pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T) +{ + return boost::math::make_pair(val, val); +} + +}}} // boost::math::detail #endif // BOOST_MATH_POLICY_ERROR_HANDLING_HPP diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp index 00b8e45bf2..27901a1131 100644 --- a/include/boost/math/special_functions/beta.hpp +++ b/include/boost/math/special_functions/beta.hpp @@ -28,14 +28,10 @@ #include #include #include - -#ifndef BOOST_MATH_HAS_NVRTC #include #include #include #include -#include -#endif namespace boost{ namespace math{ @@ -800,7 +796,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::la policies::check_series_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%) in ibeta_series (without lanczos)", max_iter, pol); return result; } - +#endif // // Continued fraction for the incomplete beta: // @@ -884,7 +880,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& p return prefix; } -#endif + // // This function is only needed for the non-regular incomplete beta, // it computes the delta in: @@ -958,7 +954,6 @@ struct Pn_size #endif }; -#ifndef BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised) { @@ -1060,7 +1055,7 @@ BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T } return sum; } // template T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Lanczos& l, bool normalised) -#endif + // // For integer arguments we can relate the incomplete beta to the // complement of the binomial distribution cdf and use this finite sum. 
@@ -1130,6 +1125,7 @@ BOOST_MATH_GPU_ENABLED T binomial_ccdf(T n, T k, T x, T y, const Policy& pol) // input range and select the right implementation method for // each domain: // + template BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_derivative) { @@ -1749,12 +1745,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type } // namespace math } // namespace boost -// TODO(mborland): Get the ibeta_inv working on NVRTC -#ifndef BOOST_MATH_HAS_NVRTC - #include #include -#endif - #endif // BOOST_MATH_SPECIAL_BETA_HPP diff --git a/include/boost/math/special_functions/binomial.hpp b/include/boost/math/special_functions/binomial.hpp index e776a90bb8..3c49ff30d5 100644 --- a/include/boost/math/special_functions/binomial.hpp +++ b/include/boost/math/special_functions/binomial.hpp @@ -10,20 +10,21 @@ #pragma once #endif +#include +#include #include #include #include #include -#include namespace boost{ namespace math{ template -T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); BOOST_MATH_STD_USING - static const char* function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)"; + constexpr auto function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)"; if(k > n) return policies::raise_domain_error(function, "The binomial coefficient is undefined for k > n, but got k = %1%.", static_cast(k), pol); T result; // LCOV_EXCL_LINE @@ -43,9 +44,9 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) { // Use the beta function: if(k < n - k) - result = static_cast(k * beta(static_cast(k), static_cast(n-k+1), pol)); + result = static_cast(k * boost::math::beta(static_cast(k), static_cast(n-k+1), pol)); else - result = static_cast((n - k) * beta(static_cast(k+1), static_cast(n-k), pol)); + result = static_cast((n - k) * boost::math::beta(static_cast(k+1), static_cast(n-k), pol)); if(result == 0) return policies::raise_overflow_error(function, nullptr, pol); result = 1 / result; @@ -59,7 +60,7 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) // we'll promote to double: // template <> -inline float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>&) +BOOST_MATH_GPU_ENABLED inline float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>&) { typedef policies::normalise< policies::policy<>, @@ -71,7 +72,7 @@ inline float binomial_coefficient >(unsigned n, unsign } template -inline T binomial_coefficient(unsigned n, unsigned k) +BOOST_MATH_GPU_ENABLED inline T binomial_coefficient(unsigned n, unsigned k) { return binomial_coefficient(n, k, policies::policy<>()); } diff --git a/include/boost/math/special_functions/cbrt.hpp b/include/boost/math/special_functions/cbrt.hpp index fb05996cf1..7fdf78d014 100644 --- a/include/boost/math/special_functions/cbrt.hpp +++ b/include/boost/math/special_functions/cbrt.hpp @@ -11,15 +11,16 @@ #pragma once #endif -#ifndef __CUDACC_RTC__ - #include + +#ifndef BOOST_MATH_HAS_NVRTC + #include +#include +#include #include #include #include -#include -#include namespace boost{ namespace math{ @@ -174,19 +175,30 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type cbrt(T z) } // namespace math } // namespace boost -#else +#else // 
Special NVRTC handling namespace boost { namespace math { template -__host__ __device__ T cbrt(T x) +BOOST_MATH_GPU_ENABLED double cbrt(T x) +{ + return ::cbrt(x); +} + +BOOST_MATH_GPU_ENABLED inline float cbrt(float x) +{ + return ::cbrtf(x); +} + +template +BOOST_MATH_GPU_ENABLED double cbrt(T x, const Policy&) { return ::cbrt(x); } -template <> -__host__ __device__ float cbrt(float x) +template +BOOST_MATH_GPU_ENABLED float cbrt(float x, const Policy&) { return ::cbrtf(x); } diff --git a/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp b/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp index 9e30db2a37..aab18f50f1 100644 --- a/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp +++ b/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp @@ -17,17 +17,19 @@ #pragma once #endif -#include -#include +#include #include +#include +#include +#include namespace boost{ namespace math{ namespace detail{ template struct beta_inv_ab_t { - beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {} - T operator()(T a) + BOOST_MATH_GPU_ENABLED beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {} + BOOST_MATH_GPU_ENABLED T operator()(T a) { return invert ? p - boost::math::ibetac(swap_ab ? b : a, swap_ab ? a : b, z, Policy()) @@ -39,7 +41,7 @@ struct beta_inv_ab_t }; template -T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol) { BOOST_MATH_STD_USING // mean: @@ -72,7 +74,7 @@ T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Pol } template -T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol) { BOOST_MATH_STD_USING // for ADL of std lib math functions // @@ -121,11 +123,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // if((p < q) != swap_ab) { - guess = (std::min)(T(b * 2), T(1)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(1)); } else { - guess = (std::min)(T(b / 2), T(1)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(1)); } } if(n * n * n * u * sf > 0.005) @@ -138,11 +140,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // if((p < q) != swap_ab) { - guess = (std::min)(T(b * 2), T(10)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(10)); } else { - guess = (std::min)(T(b / 2), T(10)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(10)); } } else @@ -151,8 +153,8 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // // Max iterations permitted: // - std::uintmax_t max_iter = policies::get_max_root_iterations(); - std::pair r = bracket_and_solve_root(f, guess, factor, swap_ab ? true : false, tol, max_iter, pol); + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::pair r = bracket_and_solve_root(f, guess, factor, swap_ab ? 
true : false, tol, max_iter, pol); if(max_iter >= policies::get_max_root_iterations()) return policies::raise_evaluation_error("boost::math::ibeta_invab_imp<%1%>(%1%,%1%,%1%)", "Unable to locate the root within a reasonable number of iterations, closest approximation so far was %1%", r.first, pol); return (r.first + r.second) / 2; @@ -161,7 +163,7 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, } // namespace detail template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibeta_inva(RT1 b, RT2 x, RT3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -194,7 +196,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibetac_inva(RT1 b, RT2 x, RT3 q, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -227,7 +229,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibeta_invb(RT1 a, RT2 x, RT3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -260,7 +262,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibetac_invb(RT1 a, RT2 x, RT3 q, const Policy& pol) { constexpr auto function = "boost::math::ibeta_invb<%1%>(%1%, %1%, %1%)"; @@ -293,28 +295,28 @@ typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inva(RT1 b, RT2 x, RT3 p) { return boost::math::ibeta_inva(b, x, p, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inva(RT1 b, RT2 x, RT3 q) { return boost::math::ibetac_inva(b, x, q, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_invb(RT1 a, RT2 x, RT3 p) { return boost::math::ibeta_invb(a, x, p, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_invb(RT1 a, RT2 x, RT3 q) { return boost::math::ibetac_invb(a, x, q, policies::policy<>()); diff --git a/include/boost/math/special_functions/detail/ibeta_inverse.hpp b/include/boost/math/special_functions/detail/ibeta_inverse.hpp index 9e4fb08d4e..90f6e90705 100644 --- a/include/boost/math/special_functions/detail/ibeta_inverse.hpp +++ b/include/boost/math/special_functions/detail/ibeta_inverse.hpp @@ -11,12 +11,14 @@ #pragma once #endif +#include +#include +#include +#include #include #include -#include #include #include -#include namespace boost{ namespace math{ namespace detail{ @@ -27,12 +29,12 @@ namespace boost{ namespace math{ namespace detail{ template struct temme_root_finder { - temme_root_finder(const T t_, const T a_) : t(t_), a(a_) { + BOOST_MATH_GPU_ENABLED temme_root_finder(const T t_, const T a_) : t(t_), a(a_) { BOOST_MATH_ASSERT( math::tools::epsilon() <= a && !(boost::math::isinf)(a)); } - boost::math::tuple operator()(T x) + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(T x) { BOOST_MATH_STD_USING // ADL of std names @@ -52,7 +54,7 @@ struct temme_root_finder // Section 2. 
// template -T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -138,7 +140,7 @@ T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) // Section 3. // template -T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -315,7 +317,7 @@ T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy // Section 4. // template -T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -420,10 +422,10 @@ T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) template struct ibeta_roots { - ibeta_roots(T _a, T _b, T t, bool inv = false) + BOOST_MATH_GPU_ENABLED ibeta_roots(T _a, T _b, T t, bool inv = false) : a(_a), b(_b), target(t), invert(inv) {} - boost::math::tuple operator()(T x) + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(T x) { BOOST_MATH_STD_USING // ADL of std names @@ -457,7 +459,7 @@ struct ibeta_roots }; template -T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) +BOOST_MATH_GPU_ENABLED T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) { BOOST_MATH_STD_USING // For ADL of math functions. @@ -487,8 +489,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) return p; } // Change things around so we can handle as b == 1 special case below: - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = true; } // @@ -524,8 +526,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } else if(b > 0.5f) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } } @@ -559,7 +561,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) y = -boost::math::expm1(boost::math::log1p(-q, pol) / a, pol); } if(invert) - std::swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(x, y); if(py) *py = y; return x; @@ -574,12 +576,12 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(p > 0.5) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } - T minv = (std::min)(a, b); - T maxv = (std::max)(a, b); + T minv = BOOST_MATH_GPU_SAFE_MIN(a, b); + T maxv = BOOST_MATH_GPU_SAFE_MAX(a, b); if((sqrt(minv) > (maxv - minv)) && (minv > 5)) { // @@ -630,8 +632,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(a < b) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } // @@ -694,8 +696,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } if(fs < 0) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; xs = 1 - xs; } @@ -758,9 +760,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) if(ps < 0) { - std::swap(a, b); - std::swap(p, q); - std::swap(xs, xs2); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); + BOOST_MATH_GPU_SAFE_SWAP(xs, xs2); invert = !invert; } // @@ -823,8 +825,8 @@ T ibeta_inv_imp(T a, T b, T p, T 
q, const Policy& pol, T* py) // if(b < a) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } if (a < tools::min_value()) @@ -890,9 +892,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(x > 0.5) { - std::swap(a, b); - std::swap(p, q); - std::swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); + BOOST_MATH_GPU_SAFE_SWAP(x, y); invert = !invert; T l = 1 - upper; T u = 1 - lower; @@ -922,8 +924,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) if(x < lower) x = lower; } - std::uintmax_t max_iter = policies::get_max_root_iterations(); - std::uintmax_t max_iter_used = 0; + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::uintmax_t max_iter_used = 0; // // Figure out how many digits to iterate towards: // @@ -946,7 +948,13 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // Now iterate, we can use either p or q as the target here // depending on which is smaller: // + // Since we can't use halley_iterate on device we use newton raphson + // + #ifndef BOOST_MATH_HAS_GPU_SUPPORT x = boost::math::tools::halley_iterate( + #else + x = boost::math::tools::newton_raphson_iterate( + #endif boost::math::detail::ibeta_roots(a, b, (p < q ? p : q), (p < q ? false : true)), x, lower, upper, digits, max_iter); policies::check_root_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%)", max_iter + max_iter_used, pol); // @@ -968,7 +976,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, T4* py, const Policy& pol) { constexpr auto function = "boost::math::ibeta_inv<%1%>(%1%,%1%,%1%)"; @@ -1003,14 +1011,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, T4* py) { return ibeta_inv(a, b, p, py, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p) { typedef typename tools::promote_args::type result_type; @@ -1018,7 +1026,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -1026,7 +1034,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(T1 a, T2 b, T3 q, T4* py, const Policy& pol) { constexpr auto function = "boost::math::ibetac_inv<%1%>(%1%,%1%,%1%)"; @@ -1061,14 +1069,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(T1 a, T2 b, T3 q, T4* py) { return ibetac_inv(a, b, q, py, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(RT1 a, RT2 b, RT3 q) { typedef typename tools::promote_args::type result_type; @@ -1076,7 +1084,7 @@ inline typename tools::promote_args::type } template -inline typename 
tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(RT1 a, RT2 b, RT3 q, const Policy& pol) { typedef typename tools::promote_args::type result_type; diff --git a/include/boost/math/special_functions/detail/t_distribution_inv.hpp b/include/boost/math/special_functions/detail/t_distribution_inv.hpp index 9209b6d405..22e1d11f02 100644 --- a/include/boost/math/special_functions/detail/t_distribution_inv.hpp +++ b/include/boost/math/special_functions/detail/t_distribution_inv.hpp @@ -11,6 +11,9 @@ #pragma once #endif +#include +#include +#include #include #include #include @@ -24,7 +27,7 @@ namespace boost{ namespace math{ namespace detail{ // Communications of the ACM, 13(10): 619-620, Oct., 1970. // template -T inverse_students_t_hill(T ndf, T u, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_hill(T ndf, T u, const Policy& pol) { BOOST_MATH_STD_USING BOOST_MATH_ASSERT(u <= 0.5); @@ -74,7 +77,7 @@ T inverse_students_t_hill(T ndf, T u, const Policy& pol) // Journal of Computational Finance, Vol 9 Issue 4, pp 37-73, Summer 2006 // template -T inverse_students_t_tail_series(T df, T v, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_tail_series(T df, T v, const Policy& pol) { BOOST_MATH_STD_USING // Tail series expansion, see section 6 of Shaw's paper. @@ -125,7 +128,7 @@ T inverse_students_t_tail_series(T df, T v, const Policy& pol) } template -T inverse_students_t_body_series(T df, T u, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_body_series(T df, T u, const Policy& pol) { BOOST_MATH_STD_USING // @@ -204,7 +207,7 @@ T inverse_students_t_body_series(T df, T u, const Policy& pol) } template -T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) +BOOST_MATH_GPU_ENABLED T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) { // // df = number of degrees of freedom. 
@@ -220,7 +223,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) if(u > v) { // function is symmetric, invert it: - std::swap(u, v); + BOOST_MATH_GPU_SAFE_SWAP(u, v); invert = true; } if((floor(df) == df) && (df < 20)) @@ -416,7 +419,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) } template -inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) { T u = p / 2; T v = 1 - u; @@ -427,7 +430,7 @@ inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) } template -inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::false_type*) +BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::false_type*) { BOOST_MATH_STD_USING // @@ -450,12 +453,12 @@ inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::f } template -T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_type*) +BOOST_MATH_GPU_ENABLED T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::true_type*) { BOOST_MATH_STD_USING bool invert = false; if((df < 2) && (floor(df) != df)) - return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast(nullptr)); + return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast(nullptr)); if(p > 0.5) { p = 1 - p; @@ -521,7 +524,7 @@ T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_typ } template -inline T fast_students_t_quantile(T df, T p, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile(T df, T p, const Policy& pol) { typedef typename policies::evaluation::type value_type; typedef typename policies::normalise< @@ -531,12 +534,12 @@ inline T fast_students_t_quantile(T df, T p, const Policy& pol) policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant::digits <= 53) + typedef boost::math::integral_constant::digits <= 53) && - (std::numeric_limits::is_specialized) + (boost::math::numeric_limits::is_specialized) && - (std::numeric_limits::radix == 2) + (boost::math::numeric_limits::radix == 2) > tag_type; return policies::checked_narrowing_cast(fast_students_t_quantile_imp(static_cast(df), static_cast(p), pol, static_cast(nullptr)), "boost::math::students_t_quantile<%1%>(%1%,%1%,%1%)"); } diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 186befb612..41c85936dd 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1735,10 +1735,50 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in // // Ratios of two gamma functions: // +template +BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos_final(T z, T delta, const Policy& pol, const Lanczos&) +{ + BOOST_MATH_STD_USING + + T zgh = static_cast(z + T(Lanczos::g()) - constants::half()); + T result; + if(z + delta == z) + { + if (fabs(delta / zgh) < boost::math::tools::epsilon()) + { + // We have: + // result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); + // 0.5 - z == -z + // log1p(delta / zgh) = delta / zgh = delta / z + // multiplying we get -delta. + result = exp(-delta); + } + else + // from the pow formula below... 
but this may actually be wrong, we just can't really calculate it :( + result = 1; + } + else + { + if(fabs(delta) < 10) + { + result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); + } + else + { + result = pow(T(zgh / (zgh + delta)), T(z - constants::half())); + } + // Split the calculation up to avoid spurious overflow: + result *= Lanczos::lanczos_sum(z) / Lanczos::lanczos_sum(T(z + delta)); + } + result *= pow(T(constants::e() / (zgh + delta)), delta); + return result; +} + template BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING + if(z < tools::epsilon()) { // @@ -1752,7 +1792,7 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli // if(boost::math::max_factorial::value < delta) { - T ratio = tgamma_delta_ratio_imp_lanczos(delta, T(boost::math::max_factorial::value - delta), pol, l); + T ratio = tgamma_delta_ratio_imp_lanczos_final(T(delta), T(boost::math::max_factorial::value - delta), pol, l); ratio *= z; ratio *= boost::math::unchecked_factorial(boost::math::max_factorial::value - 1); return 1 / ratio; @@ -1773,39 +1813,10 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli #endif } } - T zgh = static_cast(z + T(Lanczos::g()) - constants::half()); - T result; - if(z + delta == z) - { - if (fabs(delta / zgh) < boost::math::tools::epsilon()) - { - // We have: - // result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); - // 0.5 - z == -z - // log1p(delta / zgh) = delta / zgh = delta / z - // multiplying we get -delta. - result = exp(-delta); - } - else - // from the pow formula below... but this may actually be wrong, we just can't really calculate it :( - result = 1; - } - else - { - if(fabs(delta) < 10) - { - result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); - } - else - { - result = pow(T(zgh / (zgh + delta)), T(z - constants::half())); - } - // Split the calculation up to avoid spurious overflow: - result *= Lanczos::lanczos_sum(z) / Lanczos::lanczos_sum(T(z + delta)); - } - result *= pow(T(constants::e() / (zgh + delta)), delta); - return result; + + return tgamma_delta_ratio_imp_lanczos_final(T(z), T(delta), pol, l); } + // // And again without Lanczos support this time: // diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 16ae3b61eb..e3a2722c3d 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -26,7 +26,19 @@ #include -#ifndef BOOST_MATH_HAS_NVRTC +#ifdef BOOST_MATH_HAS_NVRTC + +namespace boost { +namespace math { + +template +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type +beta(RT1 a, RT2 b, A arg); + +} // namespace math +} // namespace boost + +#else #include #include @@ -154,9 +166,9 @@ namespace boost // Binomial: template - T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); + BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); template - T binomial_coefficient(unsigned n, unsigned k); + BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k); // erf & erfc error functions. template // Error function. 
@@ -874,19 +886,19 @@ namespace boost BOOST_MATH_GPU_ENABLED tools::promote_args_t cos_pi(T x); template - int fpclassify BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED int fpclassify BOOST_NO_MACRO_EXPAND(T t); template - bool isfinite BOOST_NO_MACRO_EXPAND(T z); + BOOST_MATH_GPU_ENABLED bool isfinite BOOST_NO_MACRO_EXPAND(T z); template - bool isinf BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isinf BOOST_NO_MACRO_EXPAND(T t); template - bool isnan BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isnan BOOST_NO_MACRO_EXPAND(T t); template - bool isnormal BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isnormal BOOST_NO_MACRO_EXPAND(T t); template BOOST_MATH_GPU_ENABLED int signbit BOOST_NO_MACRO_EXPAND(T x); @@ -1218,62 +1230,62 @@ namespace boost BOOST_MATH_DETAIL_11_FUNC(Policy)\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ beta(RT1 a, RT2 b) { return ::boost::math::beta(a, b, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ beta(RT1 a, RT2 b, A x){ return ::boost::math::beta(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ betac(RT1 a, RT2 b, RT3 x) { return ::boost::math::betac(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibetac(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inv(T1 a, T2 b, T3 p, T4* py){ return ::boost::math::ibeta_inv(a, b, p, py, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inv(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inv(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inv(T1 a, T2 b, T3 q, T4* py){ return ::boost::math::ibetac_inv(a, b, q, py, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inva(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inva(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inva(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_inva(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_invb(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_invb(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_invb(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_invb(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inv(RT1 a, RT2 b, RT3 q){ return 
::boost::math::ibetac_inv(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_derivative(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta_derivative(a, b, x, Policy()); }\ \ - template T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient(n, k, Policy()); }\ + template BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient(n, k, Policy()); }\ \ template \ BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erf(RT z) { return ::boost::math::erf(z, Policy()); }\ diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index a2af127630..fac8f84db0 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -800,7 +800,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return #define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast(V) #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING -#define BOOST_MATH_IF_CONSTEXPR if constexpr +#define BOOST_MATH_IF_CONSTEXPR if #define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point::value) #define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr #define BOOST_MATH_NO_EXCEPTIONS diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 1fb55da197..c9a70e8a99 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -25,6 +25,13 @@ run test_bernoulli_pdf_float.cu ; run test_bernoulli_range_support_double.cu ; run test_bernoulli_range_support_float.cu ; +run test_beta_dist_cdf_double.cu ; +run test_beta_dist_cdf_float.cu ; +run test_beta_dist_pdf_double.cu ; +run test_beta_dist_pdf_float.cu ; +run test_beta_dist_quan_double.cu ; +run test_beta_dist_quan_float.cu ; + run test_cauchy_cdf_double.cu ; run test_cauchy_cdf_float.cu ; run test_cauchy_pdf_double.cu ; @@ -107,6 +114,24 @@ run test_weibull_quan_float.cu ; # Special Functions run test_beta_double.cu ; run test_beta_float.cu ; +run test_betac_double.cu ; +run test_betac_float.cu ; +run test_ibeta_double.cu ; +run test_ibeta_float.cu ; +run test_ibeta_derivative_double.cu ; +run test_ibeta_derivative_float.cu ; +run test_ibeta_inv_double.cu ; +run test_ibeta_inv_float.cu ; +run test_ibeta_inva_double.cu ; +run test_ibeta_inva_float.cu ; +run test_ibeta_invb_double.cu ; +run test_ibeta_invb_float.cu ; +run test_ibetac_inv_double.cu ; +run test_ibetac_inv_float.cu ; +run test_ibetac_inva_double.cu ; +run test_ibetac_inva_float.cu ; +run test_ibetac_invb_double.cu ; +run test_ibetac_invb_float.cu ; run test_bessel_i0_double.cu ; run test_bessel_i0_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 4a37960a51..1fc2746a1f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -24,6 +24,13 @@ run test_bernoulli_pdf_nvrtc_float.cpp ; run test_bernoulli_quan_nvrtc_double.cpp ; run test_bernoulli_quan_nvrtc_float.cpp ; +run test_beta_dist_cdf_nvrtc_double.cpp ; +run test_beta_dist_cdf_nvrtc_float.cpp ; +run test_beta_dist_pdf_nvrtc_double.cpp ; +run test_beta_dist_pdf_nvrtc_float.cpp ; +run test_beta_dist_quan_nvrtc_double.cpp ; +run test_beta_dist_quan_nvrtc_float.cpp ; + run test_cauchy_cdf_nvrtc_double.cpp ; run test_cauchy_cdf_nvrtc_float.cpp ; run test_cauchy_pdf_nvrtc_double.cpp ; @@ -104,6 +111,26 @@ run test_weibull_quan_nvrtc_float.cpp ; # Special Functions run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; +run test_betac_nvrtc_double.cpp ; 
+run test_betac_nvrtc_float.cpp ; +run test_ibeta_nvrtc_double.cpp ; +run test_ibeta_nvrtc_float.cpp ; +run test_ibetac_nvrtc_double.cpp ; +run test_ibetac_nvrtc_float.cpp ; +run test_ibeta_derivative_nvrtc_double.cpp ; +run test_ibeta_derivative_nvrtc_float.cpp ; +run test_ibeta_inv_nvrtc_double.cpp ; +run test_ibeta_inv_nvrtc_float.cpp ; +run test_ibeta_inva_nvrtc_double.cpp ; +run test_ibeta_inva_nvrtc_float.cpp ; +run test_ibeta_invb_nvrtc_double.cpp ; +run test_ibeta_invb_nvrtc_float.cpp ; +run test_ibetac_inv_nvrtc_double.cpp ; +run test_ibetac_inv_nvrtc_float.cpp ; +run test_ibetac_inva_nvrtc_double.cpp ; +run test_ibetac_inva_nvrtc_float.cpp ; +run test_ibetac_invb_nvrtc_double.cpp ; +run test_ibetac_invb_nvrtc_float.cpp ; run test_bessel_i0_nvrtc_double.cpp ; run test_bessel_i0_nvrtc_float.cpp ; diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 2fd5954ae1..5d3d85cd8f 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -12,6 +12,7 @@ project : requirements # Distributions run test_arcsine.cpp ; run test_bernoulli.cpp ; +run test_beta_dist.cpp ; run test_cauchy.cpp ; run test_chi_squared.cpp ; run test_exponential_dist.cpp ; @@ -28,6 +29,10 @@ run test_weibull.cpp ; run pow_test.cpp ; run test_beta_simple.cpp ; +run test_beta.cpp ; +run test_ibeta.cpp ; +run test_ibeta_inv.cpp ; +run test_ibeta_inv_ab.cpp ; run test_bessel_i.cpp ; run test_bessel_j.cpp ; diff --git a/test/test_bessel_i.cpp b/test/test_bessel_i.cpp index 09487ddf1b..817569760a 100644 --- a/test/test_bessel_i.cpp +++ b/test/test_bessel_i.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_i.hpp" diff --git a/test/test_bessel_j.cpp b/test/test_bessel_j.cpp index 31a64bc579..1dd63a68a5 100644 --- a/test/test_bessel_j.cpp +++ b/test/test_bessel_j.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_j.hpp" diff --git a/test/test_bessel_k.cpp b/test/test_bessel_k.cpp index 84ba0830f2..6c31f5ab05 100644 --- a/test/test_bessel_k.cpp +++ b/test/test_bessel_k.cpp @@ -22,7 +22,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_k.hpp" diff --git a/test/test_bessel_y.cpp b/test/test_bessel_y.cpp index 232a903963..8251920c5b 100644 --- a/test/test_bessel_y.cpp +++ b/test/test_bessel_y.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_y.hpp" diff --git a/test/test_beta.cpp b/test/test_beta.cpp index b24cb32c07..4e27b71353 100644 --- a/test/test_beta.cpp +++ b/test/test_beta.cpp @@ -5,7 +5,17 @@ // Boost Software License, Version 1.0. 
(See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-#include "pch_light.hpp"
+#ifndef SYCL_LANGUAGE_VERSION
+#include <pch_light.hpp>
+#endif
+
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Woverflow"
+#endif
 
 #include "test_beta.hpp"
 
diff --git a/test/test_beta.hpp b/test/test_beta.hpp
index e633935a3c..3019c17e71 100644
--- a/test/test_beta.hpp
+++ b/test/test_beta.hpp
@@ -18,9 +18,10 @@
 #define BOOST_TEST_MAIN
 #include
 #include
+#include
 #include
 #include
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include
 #include
 #include
@@ -109,9 +110,12 @@ void test_spots(T)
       // Inexact input, so disable for ultra precise long doubles:
       BOOST_CHECK_CLOSE(::boost::math::beta(static_cast<T>(0.0125L), static_cast<T>(0.000023L)), static_cast<T>(43558.24045647538375006349016083320744662L), tolerance * 2);
    }
+
+   #ifndef BOOST_MATH_NO_EXCEPTIONS
    BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(0), static_cast<T>(1)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(-1), static_cast<T>(1)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(1), static_cast<T>(-1)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(1), static_cast<T>(0)), std::domain_error);
+   #endif
 }
diff --git a/test/test_beta_dist.cpp b/test/test_beta_dist.cpp
index 943718a39f..1652309eb7 100644
--- a/test/test_beta_dist.cpp
+++ b/test/test_beta_dist.cpp
@@ -32,9 +32,14 @@
 # pragma warning (disable : 4224) // nonstandard extension used : formal parameter 'arg' was previously defined as a type.
 #endif
 
+#include
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include  // for real_concept
 using ::boost::math::concepts::real_concept;
-#include <boost/math/tools/test.hpp>
+#endif
+
+#include "../include_private/boost/math/tools/test.hpp"
 
 #include  // for beta_distribution
 using boost::math::beta_distribution;
@@ -634,12 +639,13 @@ BOOST_AUTO_TEST_CASE( test_main )
    BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_alpha(mybeta22.beta(), 0.8, cdf(mybeta22, 0.8)), mybeta22.alpha(), tol);
    BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_beta(mybeta22.alpha(), 0.8, cdf(mybeta22, 0.8)), mybeta22.beta(), tol);
 
-
+   #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
    beta_distribution<real_concept> rcbeta22(2, 2); // Using RealType real_concept.
    cout << "numeric_limits<real_concept>::is_specialized " << numeric_limits<real_concept>::is_specialized << endl;
    cout << "numeric_limits<real_concept>::digits " << numeric_limits<real_concept>::digits << endl;
    cout << "numeric_limits<real_concept>::digits10 " << numeric_limits<real_concept>::digits10 << endl;
    cout << "numeric_limits<real_concept>::epsilon " << numeric_limits<real_concept>::epsilon() << endl;
+   #endif
 
    // (Parameter value, arbitrarily zero, only communicates the floating point type).
    test_spots(0.0F); // Test float.
diff --git a/test/test_beta_dist_cdf_double.cu b/test/test_beta_dist_cdf_double.cu
new file mode 100644
index 0000000000..fa460244a3
--- /dev/null
+++ b/test/test_beta_dist_cdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.  (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_cdf_float.cu b/test/test_beta_dist_cdf_float.cu
new file mode 100644
index 0000000000..321c844205
--- /dev/null
+++ b/test/test_beta_dist_cdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.  (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_cdf_nvrtc_double.cpp b/test/test_beta_dist_cdf_nvrtc_double.cpp
new file mode 100644
index 0000000000..4f5913c108
--- /dev/null
+++ b/test/test_beta_dist_cdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
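+            // Note: only in1 is read by the kernel above; the second input vector is
+            // generated and copied anyway, presumably so that all of these
+            // beta-distribution NVRTC harnesses share one four-argument kernel signature.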
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_cdf_nvrtc_float.cpp b/test/test_beta_dist_cdf_nvrtc_float.cpp new file mode 100644 index 0000000000..f5b031c5a9 --- /dev/null +++ b/test/test_beta_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_pdf_double.cu b/test/test_beta_dist_pdf_double.cu new file mode 100644 index 0000000000..c0ee9272ae --- /dev/null +++ b/test/test_beta_dist_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_pdf_float.cu b/test/test_beta_dist_pdf_float.cu
new file mode 100644
index 0000000000..75e4fa27b4
--- /dev/null
+++ b/test/test_beta_dist_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.  (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_pdf_nvrtc_double.cpp b/test/test_beta_dist_pdf_nvrtc_double.cpp
new file mode 100644
index 0000000000..c9870e2ce4
--- /dev/null
+++ b/test/test_beta_dist_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
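+            // Note: the fixed seed (42) for mt19937_64 above keeps the device results
+            // and the host-side reference loop below reproducible between runs.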
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_pdf_nvrtc_float.cpp b/test/test_beta_dist_pdf_nvrtc_float.cpp new file mode 100644 index 0000000000..0b4fd83488 --- /dev/null +++ b/test/test_beta_dist_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_quan_double.cu b/test/test_beta_dist_quan_double.cu new file mode 100644 index 0000000000..101526afae --- /dev/null +++ b/test/test_beta_dist_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_quan_float.cu b/test/test_beta_dist_quan_float.cu
new file mode 100644
index 0000000000..77696c6393
--- /dev/null
+++ b/test/test_beta_dist_quan_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.  (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_quan_nvrtc_double.cpp b/test/test_beta_dist_quan_nvrtc_double.cpp
new file mode 100644
index 0000000000..9726bf019e
--- /dev/null
+++ b/test/test_beta_dist_quan_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_quan_nvrtc_float.cpp b/test/test_beta_dist_quan_nvrtc_float.cpp new file mode 100644 index 0000000000..d2476cb2ac --- /dev/null +++ b/test/test_beta_dist_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_betac_double.cu b/test/test_betac_double.cu new file mode 100644 index 0000000000..8bb31d3219 --- /dev/null +++ b/test/test_betac_double.cu @@ -0,0 +1,146 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <cstdlib>
+#include <exception>
+#include <vector>
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "beta_med_data.ipp"
+#include "beta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < beta_med_data.size(); ++i)
+    {
+        v1.push_back(beta_med_data[i][0]);
+        v2.push_back(beta_med_data[i][1]);
+        v3.push_back(beta_med_data[i][2]);
+    }
+    for(unsigned i = 0; i < beta_small_data.size(); ++i)
+    {
+        v1.push_back(beta_small_data[i][0]);
+        v2.push_back(beta_small_data[i][1]);
+        v3.push_back(beta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the table data
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the betac CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct: recompute serially on the host
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_betac_float.cu b/test/test_betac_float.cu
new file mode 100644
index 0000000000..7070c567cc
--- /dev/null
+++ b/test/test_betac_float.cu
@@ -0,0 +1,146 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <cstdlib>
+#include <exception>
+#include <vector>
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "beta_med_data.ipp"
+#include "beta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < beta_med_data.size(); ++i)
+    {
+        v1.push_back(beta_med_data[i][0]);
+        v2.push_back(beta_med_data[i][1]);
+        v3.push_back(beta_med_data[i][2]);
+    }
+    for(unsigned i = 0; i < beta_small_data.size(); ++i)
+    {
+        v1.push_back(beta_small_data[i][0]);
+        v2.push_back(beta_small_data[i][1]);
+        v3.push_back(beta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the table data
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the betac CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel 
(error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_betac_nvrtc_double.cpp b/test/test_betac_nvrtc_double.cpp new file mode 100644 index 0000000000..0667cfe0d4 --- /dev/null +++ b/test/test_betac_nvrtc_double.cpp @@ -0,0 +1,196 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::betac(in1[i], in2[i], in3[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + h_in3[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + 
delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_betac_nvrtc_float.cpp b/test/test_betac_nvrtc_float.cpp
new file mode 100644
index 0000000000..0667cfe0d4
--- /dev/null
+++ b/test/test_betac_nvrtc_float.cpp
@@ -0,0 +1,196 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/beta.hpp>
+extern "C" __global__
+void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_beta_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+            h_in3[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully."
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_erf.cpp b/test/test_erf.cpp index 5044847114..2232c1c759 100644 --- a/test/test_erf.cpp +++ b/test/test_erf.cpp @@ -13,7 +13,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error diff --git a/test/test_holtsmark.cpp b/test/test_holtsmark.cpp index 475f5400aa..93a40924d6 100644 --- a/test/test_holtsmark.cpp +++ b/test/test_holtsmark.cpp @@ -21,7 +21,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif using boost::math::holtsmark_distribution; diff --git a/test/test_ibeta.cpp b/test/test_ibeta.cpp index e026ac6c52..987b361105 100644 --- a/test/test_ibeta.cpp +++ b/test/test_ibeta.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include "test_ibeta.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta.hpp b/test/test_ibeta.hpp index 7c951d614f..cfd5d78cd1 100644 --- a/test/test_ibeta.hpp +++ b/test/test_ibeta.hpp @@ -8,9 +8,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include diff --git a/test/test_ibeta_derivative.cpp b/test/test_ibeta_derivative.cpp index c899c94bf5..5d6a312754 100644 --- a/test/test_ibeta_derivative.cpp +++ b/test/test_ibeta_derivative.cpp @@ -4,7 +4,7 @@ // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #if defined(__GNUC__) && __GNUC__ <= 12 #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wliteral-range" +#pragma GCC diagnostic ignored "-Woverflow" #endif #include #include "test_ibeta_derivative.hpp" diff --git a/test/test_ibeta_derivative_double.cu b/test/test_ibeta_derivative_double.cu new file mode 100644 index 0000000000..e5f7f340ba --- /dev/null +++ b/test/test_ibeta_derivative_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_derivative_float.cu b/test/test_ibeta_derivative_float.cu new file mode 100644 index 0000000000..36a79665d4 --- /dev/null +++ b/test/test_ibeta_derivative_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << 
w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_derivative_nvrtc_double.cpp b/test/test_ibeta_derivative_nvrtc_double.cpp new file mode 100644 index 0000000000..f15d21db00 --- /dev/null +++ b/test/test_ibeta_derivative_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + 
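        // Compile the embedded kernel source with NVRTC at run time,
        // then load the resulting PTX through the CUDA driver API below.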
nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 
0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_derivative_nvrtc_float.cpp b/test/test_ibeta_derivative_nvrtc_float.cpp new file mode 100644 index 0000000000..17443e0bdc --- /dev/null +++ b/test/test_ibeta_derivative_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << 
std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_double.cu b/test/test_ibeta_double.cu new file mode 100644 index 0000000000..20384bf25f --- /dev/null +++ b/test/test_ibeta_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + 
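    // Kernel launches are asynchronous: after cudaDeviceSynchronize(),
    // cudaGetLastError() reports any launch or execution failure.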
err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_float.cu b/test/test_ibeta_float.cu new file mode 100644 index 0000000000..be17813ee4 --- /dev/null +++ b/test/test_ibeta_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inv.cpp b/test/test_ibeta_inv.cpp index 218c1625e8..ab1f4267fc 100644 --- a/test/test_ibeta_inv.cpp +++ b/test/test_ibeta_inv.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include"test_ibeta_inv.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta_inv.hpp b/test/test_ibeta_inv.hpp index ba98901773..fa765b2ef8 100644 --- a/test/test_ibeta_inv.hpp +++ b/test/test_ibeta_inv.hpp @@ -8,10 +8,11 @@ #define BOOST_TEST_MAIN #include #include +#include #include // for has_denorm_now #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include @@ -306,6 +307,7 @@ void test_spots(T) BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast(2.125), -n, static_cast(0.125)), std::domain_error); BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast(2.125), static_cast(1.125), -n), std::domain_error); } + #ifndef SYCL_LANGUAGE_VERSION if (boost::math::detail::has_denorm_now()) { T m = std::numeric_limits::denorm_min(); @@ -317,5 +319,6 @@ void test_spots(T) BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(static_cast(12.125), m, static_cast(0.125)))); BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(m, m, static_cast(0.125)))); } + #endif } diff --git a/test/test_ibeta_inv_ab.cpp b/test/test_ibeta_inv_ab.cpp index c1acb2d1ca..fdf735ef1e 100644 --- a/test/test_ibeta_inv_ab.cpp +++ b/test/test_ibeta_inv_ab.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include "test_ibeta_inv_ab.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta_inv_ab.hpp b/test/test_ibeta_inv_ab.hpp index c378d15287..b91ab5261d 100644 --- a/test/test_ibeta_inv_ab.hpp +++ b/test/test_ibeta_inv_ab.hpp @@ -10,9 +10,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include diff --git a/test/test_ibeta_inv_double.cu b/test/test_ibeta_inv_double.cu new file mode 100644 index 0000000000..ef62c5e162 --- /dev/null +++ b/test/test_ibeta_inv_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
diff --git a/test/test_ibeta_inv_double.cu b/test/test_ibeta_inv_double.cu
new file mode 100644
index 0000000000..ef62c5e162
--- /dev/null
+++ b/test/test_ibeta_inv_double.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try {
+        // Consolidate the test data:
+        std::vector<float_type> v1, v2, v3;
+
+        for(unsigned i = 0; i < ibeta_data.size(); ++i)
+        {
+            v1.push_back(ibeta_data[i][0]);
+            v2.push_back(ibeta_data[i][1]);
+            v3.push_back(ibeta_data[i][2]);
+        }
+        for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            v1.push_back(ibeta_small_data[i][0]);
+            v2.push_back(ibeta_small_data[i][1]);
+            v3.push_back(ibeta_small_data[i][2]);
+        }
+
+        // Error code to check return values for CUDA calls
+        cudaError_t err = cudaSuccess;
+
+        // Print the vector length to be used
+        int numElements = 50000;
+        std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+        // Allocate the managed input vectors
+        cuda_managed_ptr<float_type> input_vector1(numElements);
+        cuda_managed_ptr<float_type> input_vector2(numElements);
+        cuda_managed_ptr<float_type> input_vector3(numElements);
+
+        // Allocate the managed output vector
+        cuda_managed_ptr<float_type> output_vector(numElements);
+
+        // Initialize the input vectors
+        for (int i = 0; i < numElements; ++i)
+        {
+            int table_id = i % v1.size();
+            input_vector1[i] = v1[table_id];
+            input_vector2[i] = v2[table_id];
+            input_vector3[i] = v3[table_id];
+        }
+
+        // Launch the CUDA kernel
+        int threadsPerBlock = 256;
+        int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+        watch w;
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+        cudaDeviceSynchronize();
+        std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+        err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // Verify that the result vector is correct
+        std::vector<float_type> results;
+        results.reserve(numElements);
+        w.reset();
+        for(int i = 0; i < numElements; ++i)
+            results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+        double t = w.elapsed();
+        bool failed = false;
+        // check the results
+        for(int i = 0; i < numElements; ++i)
+        {
+            if (boost::math::isfinite(output_vector[i]))
+            {
+                if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+                {
+                    std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                    std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                    failed = true;
+                }
+            }
+        }
+
+        if (failed)
+        {
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+        std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
+
diff --git a/test/test_ibeta_inv_float.cu b/test/test_ibeta_inv_float.cu
new file mode 100644
index 0000000000..a0d48bfbda
--- /dev/null
+++ b/test/test_ibeta_inv_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try {
+        // Consolidate the test data:
+        std::vector<float_type> v1, v2, v3;
+
+        for(unsigned i = 0; i < ibeta_data.size(); ++i)
+        {
+            v1.push_back(ibeta_data[i][0]);
+            v2.push_back(ibeta_data[i][1]);
+            v3.push_back(ibeta_data[i][2]);
+        }
+        for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            v1.push_back(ibeta_small_data[i][0]);
+            v2.push_back(ibeta_small_data[i][1]);
+            v3.push_back(ibeta_small_data[i][2]);
+        }
+
+        // Error code to check return values for CUDA calls
+        cudaError_t err = cudaSuccess;
+
+        // Print the vector length to be used
+        int numElements = 50000;
+        std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+        // Allocate the managed input vectors
+        cuda_managed_ptr<float_type> input_vector1(numElements);
+        cuda_managed_ptr<float_type> input_vector2(numElements);
+        cuda_managed_ptr<float_type> input_vector3(numElements);
+
+        // Allocate the managed output vector
+        cuda_managed_ptr<float_type> output_vector(numElements);
+
+        // Initialize the input vectors
+        for (int i = 0; i < numElements; ++i)
+        {
+            int table_id = i % v1.size();
+            input_vector1[i] = v1[table_id];
+            input_vector2[i] = v2[table_id];
+            input_vector3[i] = v3[table_id];
+        }
+
+        // Launch the CUDA kernel
+        int threadsPerBlock = 256;
+        int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+        watch w;
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+        cudaDeviceSynchronize();
+        std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
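+        // NOTE: kernel launches are asynchronous, so the cudaDeviceSynchronize()
+        // above must return before the stopwatch is read; cudaGetLastError()
+        // below then picks up any failure from the <<<...>>> launch itself.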
+        err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // Verify that the result vector is correct
+        std::vector<float_type> results;
+        results.reserve(numElements);
+        w.reset();
+        for(int i = 0; i < numElements; ++i)
+            results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+        double t = w.elapsed();
+        bool failed = false;
+        // check the results
+        for(int i = 0; i < numElements; ++i)
+        {
+            if (boost::math::isfinite(output_vector[i]))
+            {
+                if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+                {
+                    std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                    std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                    failed = true;
+                }
+            }
+        }
+
+        if (failed)
+        {
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+        std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
+
diff --git a/test/test_ibeta_inv_nvrtc_double.cpp b/test/test_ibeta_inv_nvrtc_double.cpp
new file mode 100644
index 0000000000..2f01012bbe
--- /dev/null
+++ b/test/test_ibeta_inv_nvrtc_double.cpp
@@ -0,0 +1,207 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include
+#include
+extern "C" __global__
+void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog,
cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], 
h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inv_nvrtc_float.cpp b/test/test_ibeta_inv_nvrtc_float.cpp new file mode 100644 index 0000000000..5d804398cb --- /dev/null +++ b/test/test_ibeta_inv_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", 
"--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + 
+ nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inva_double.cu b/test/test_ibeta_inva_double.cu new file mode 100644 index 0000000000..7783eb21bb --- /dev/null +++ b/test/test_ibeta_inva_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inva_float.cu b/test/test_ibeta_inva_float.cu new file mode 100644 index 0000000000..ff918f9436 --- /dev/null +++ b/test/test_ibeta_inva_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % 
v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inva_nvrtc_double.cpp b/test/test_ibeta_inva_nvrtc_double.cpp new file mode 100644 index 0000000000..a392eaea65 --- /dev/null +++ b/test/test_ibeta_inva_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, 
*d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inva_nvrtc_float.cpp b/test/test_ibeta_inva_nvrtc_float.cpp new file mode 100644 index 0000000000..ba5745c321 --- /dev/null +++ b/test/test_ibeta_inva_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, 
*d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_invb_double.cu b/test/test_ibeta_invb_double.cu new file mode 100644 index 0000000000..562f5349dd --- /dev/null +++ b/test/test_ibeta_invb_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try {
+        // Consolidate the test data:
+        std::vector<float_type> v1, v2, v3;
+
+        for(unsigned i = 0; i < ibeta_data.size(); ++i)
+        {
+            v1.push_back(ibeta_data[i][0]);
+            v2.push_back(ibeta_data[i][1]);
+            v3.push_back(ibeta_data[i][2]);
+        }
+        for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            v1.push_back(ibeta_small_data[i][0]);
+            v2.push_back(ibeta_small_data[i][1]);
+            v3.push_back(ibeta_small_data[i][2]);
+        }
+
+        // Error code to check return values for CUDA calls
+        cudaError_t err = cudaSuccess;
+
+        // Print the vector length to be used
+        int numElements = 50000;
+        std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+        // Allocate the managed input vectors
+        cuda_managed_ptr<float_type> input_vector1(numElements);
+        cuda_managed_ptr<float_type> input_vector2(numElements);
+        cuda_managed_ptr<float_type> input_vector3(numElements);
+
+        // Allocate the managed output vector
+        cuda_managed_ptr<float_type> output_vector(numElements);
+
+        // Initialize the input vectors
+        for (int i = 0; i < numElements; ++i)
+        {
+            int table_id = i % v1.size();
+            input_vector1[i] = v1[table_id];
+            input_vector2[i] = v2[table_id];
+            input_vector3[i] = v3[table_id];
+        }
+
+        // Launch the CUDA kernel
+        int threadsPerBlock = 256;
+        int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+        watch w;
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+        cudaDeviceSynchronize();
+        std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+        err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // Verify that the result vector is correct
+        std::vector<float_type> results;
+        results.reserve(numElements);
+        w.reset();
+        for(int i = 0; i < numElements; ++i)
+            results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
+        double t = w.elapsed();
+        bool failed = false;
+        // check the results
+        for(int i = 0; i < numElements; ++i)
+        {
+            if (boost::math::isfinite(output_vector[i]))
+            {
+                if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+                {
+                    std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                    std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                    failed = true;
+                }
+            }
+        }
+
+        if (failed)
+        {
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+        std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
+
diff --git a/test/test_ibeta_invb_float.cu b/test/test_ibeta_invb_float.cu
new file mode 100644
index 0000000000..86f5615c36
--- /dev/null
+++ b/test/test_ibeta_invb_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try {
+        // Consolidate the test data:
+        std::vector<float_type> v1, v2, v3;
+
+        for(unsigned i = 0; i < ibeta_data.size(); ++i)
+        {
+            v1.push_back(ibeta_data[i][0]);
+            v2.push_back(ibeta_data[i][1]);
+            v3.push_back(ibeta_data[i][2]);
+        }
+        for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            v1.push_back(ibeta_small_data[i][0]);
+            v2.push_back(ibeta_small_data[i][1]);
+            v3.push_back(ibeta_small_data[i][2]);
+        }
+
+        // Error code to check return values for CUDA calls
+        cudaError_t err = cudaSuccess;
+
+        // Print the vector length to be used
+        int numElements = 50000;
+        std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+        // Allocate the managed input vectors
+        cuda_managed_ptr<float_type> input_vector1(numElements);
+        cuda_managed_ptr<float_type> input_vector2(numElements);
+        cuda_managed_ptr<float_type> input_vector3(numElements);
+
+        // Allocate the managed output vector
+        cuda_managed_ptr<float_type> output_vector(numElements);
+
+        // Initialize the input vectors
+        for (int i = 0; i < numElements; ++i)
+        {
+            int table_id = i % v1.size();
+            input_vector1[i] = v1[table_id];
+            input_vector2[i] = v2[table_id];
+            input_vector3[i] = v3[table_id];
+        }
+
+        // Launch the CUDA kernel
+        int threadsPerBlock = 256;
+        int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+        watch w;
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+        cudaDeviceSynchronize();
+        std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
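+        // NOTE: in the check below, elements whose GPU result is non-finite are
+        // skipped, since the ignore_error overflow policy defined at the top of
+        // this file can legitimately produce infinities; finite results must agree
+        // with the serial reference to within 300 eps (boost::math::epsilon_difference).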
+ + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_invb_nvrtc_double.cpp b/test/test_ibeta_invb_nvrtc_double.cpp new file mode 100644 index 0000000000..6f046f09f3 --- /dev/null +++ b/test/test_ibeta_invb_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, 
device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // 
Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_invb_nvrtc_float.cpp b/test/test_ibeta_invb_nvrtc_float.cpp new file mode 100644 index 0000000000..f2d17b8447 --- /dev/null +++ b/test/test_ibeta_invb_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, 
nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can 
continue to process through our results array + double res; + try + { + res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_nvrtc_double.cpp b/test/test_ibeta_nvrtc_double.cpp new file mode 100644 index 0000000000..bc920b6368 --- /dev/null +++ b/test/test_ibeta_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + 
+ cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_nvrtc_float.cpp b/test/test_ibeta_nvrtc_float.cpp new file mode 100644 index 0000000000..ee15748628 --- /dev/null +++ b/test/test_ibeta_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, 
log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_inv_double.cu b/test/test_ibetac_inv_double.cu new file mode 100644 index 0000000000..a983d16677 --- /dev/null +++ b/test/test_ibetac_inv_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibetac_inv_float.cu b/test/test_ibetac_inv_float.cu new file mode 100644 index 0000000000..94583b45e2 --- /dev/null +++ b/test/test_ibetac_inv_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % 
v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the test CUDA kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibetac_inv_nvrtc_double.cpp b/test/test_ibetac_inv_nvrtc_double.cpp new file mode 100644 index 0000000000..a99d53b3cd --- /dev/null +++ b/test/test_ibetac_inv_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new 
float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_inv_nvrtc_float.cpp b/test/test_ibetac_inv_nvrtc_float.cpp new file mode 100644 index 0000000000..47e89db4c1 --- /dev/null +++ b/test/test_ibetac_inv_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new 
float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_inva_double.cu b/test/test_ibetac_inva_double.cu new file mode 100644 index 0000000000..2efbee265d --- /dev/null +++ b/test/test_ibetac_inva_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the test CUDA kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibetac_inva_float.cu b/test/test_ibetac_inva_float.cu new file mode 100644 index 0000000000..9bd1a29a07 --- /dev/null +++ b/test/test_ibetac_inva_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << 
std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibetac_inva_nvrtc_double.cpp b/test/test_ibetac_inva_nvrtc_double.cpp new file mode 100644 index 0000000000..7c7bf992b3 --- /dev/null +++ b/test/test_ibetac_inva_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + 
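+ // cuCtxCreate below binds a new context to device 0 and makes it current, which the module load and kernel launch that follow require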
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to 
copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_inva_nvrtc_float.cpp b/test/test_ibetac_inva_nvrtc_float.cpp new file mode 100644 index 0000000000..c79b8b02f1 --- /dev/null +++ b/test/test_ibetac_inva_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = 
nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { 
+ // The ignore_error policy is not always honored, so the serial reference call below may still throw. + // Rather than terminating, skip this element and keep working through the results array. + double res; + try + { + res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +}
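Every NVRTC harness in this patch repeats the same pipeline: create a program from the kernel source string, compile it, fetch the PTX, load the PTX as a module, and resolve the kernel entry point. A minimal sketch of that pipeline follows, with the error handling collapsed into early exits; the compile_to_ptx name and the single --std=c++14 option are illustrative conveniences, not part of the patch:

#include <nvrtc.h>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Sketch only: compile NVRTC source to PTX, or exit after printing the build log.
std::string compile_to_ptx(const char* src, const char* name)
{
    nvrtcProgram prog;
    if (nvrtcCreateProgram(&prog, src, name, 0, nullptr, nullptr) != NVRTC_SUCCESS)
    {
        std::cerr << "Failed to create NVRTC program" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    const char* opts[] = {"--std=c++14"};
    if (nvrtcCompileProgram(prog, sizeof(opts) / sizeof(opts[0]), opts) != NVRTC_SUCCESS)
    {
        // The build log pinpoints the offending line inside the kernel string
        std::size_t log_size;
        nvrtcGetProgramLogSize(prog, &log_size);
        std::vector<char> log(log_size);
        nvrtcGetProgramLog(prog, log.data());
        std::cerr << "Compilation failed:\n" << log.data() << std::endl;
        std::exit(EXIT_FAILURE);
    }

    std::size_t ptx_size;
    nvrtcGetPTXSize(prog, &ptx_size);
    std::string ptx(ptx_size, '\0');
    nvrtcGetPTX(prog, &ptx[0]);
    nvrtcDestroyProgram(&prog);
    return ptx;
}

The returned PTX is then handed to cuModuleLoadDataEx() and cuModuleGetFunction(), exactly as the tests here do inline.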
diff --git a/test/test_ibetac_invb_double.cu b/test/test_ibetac_invb_double.cu new file mode 100644 index 0000000000..fddd655af2 --- /dev/null +++ b/test/test_ibetac_invb_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the test CUDA kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + +
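The device-versus-host comparison in these .cu harnesses is a ULP-style check: a result is accepted when it agrees with the serial reference to within 300 machine epsilons, and non-finite references are skipped since there is nothing meaningful to compare against. Read as a standalone predicate (the within_tolerance name is illustrative, not from the patch), the acceptance test amounts to:

#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/math/special_functions/fpclassify.hpp>

// True when the GPU result matches the host reference to within max_eps
// units of machine epsilon, or when the reference is not finite.
template <class T>
bool within_tolerance(T reference, T gpu_result, T max_eps = T(300))
{
    if (!(boost::math::isfinite)(reference))
        return true;
    return boost::math::epsilon_difference(reference, gpu_result) <= max_eps;
}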
diff --git a/test/test_ibetac_invb_float.cu b/test/test_ibetac_invb_float.cu new file mode 100644 index 0000000000..fddd655af2 --- /dev/null +++ b/test/test_ibetac_invb_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the test CUDA kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibetac_invb_nvrtc_double.cpp b/test/test_ibetac_invb_nvrtc_double.cpp new file mode 100644 index 0000000000..76f6318901 --- /dev/null +++ b/test/test_ibetac_invb_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / 
sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // The ignore_error policy is not always honored, so the serial reference call below may still throw. + // Rather than terminating, skip this element and keep working through the results array. + double res; + try + { + res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) 
+ { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_invb_nvrtc_float.cpp b/test/test_ibetac_invb_nvrtc_float.cpp new file mode 100644 index 0000000000..48d0a31eec --- /dev/null +++ b/test/test_ibetac_invb_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) 
+ { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_nvrtc_double.cpp b/test/test_ibetac_nvrtc_double.cpp new file mode 100644 index 0000000000..6a59473e18 --- /dev/null +++ b/test/test_ibetac_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", 
"--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + 
nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_nvrtc_float.cpp b/test/test_ibetac_nvrtc_float.cpp new file mode 100644 index 0000000000..a989191e51 --- /dev/null +++ b/test/test_ibetac_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + 
size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_igamma.cpp b/test/test_igamma.cpp index 0ad7019963..6e034f3c60 100644 --- a/test/test_igamma.cpp +++ b/test/test_igamma.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_igamma.hpp" diff --git a/test/test_igamma_inv.cpp b/test/test_igamma_inv.cpp index 17e0bfb54f..80a553427c 100644 --- a/test/test_igamma_inv.cpp +++ b/test/test_igamma_inv.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error #include "test_igamma_inv.hpp" diff --git a/test/test_igamma_inva.cpp b/test/test_igamma_inva.cpp index 8d0e965962..443ad7bbc6 100644 --- a/test/test_igamma_inva.cpp +++ b/test/test_igamma_inva.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_igamma_inva.hpp" diff --git a/test/test_landau.cpp b/test/test_landau.cpp index 1625b21777..c69c208177 100644 --- a/test/test_landau.cpp +++ b/test/test_landau.cpp @@ -20,7 +20,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif using boost::math::landau_distribution; diff --git a/test/test_mapairy.cpp b/test/test_mapairy.cpp index ee8e43bf00..ca3b415d76 100644 --- a/test/test_mapairy.cpp +++ b/test/test_mapairy.cpp @@ -9,7 +9,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_TEST_MAIN
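The final five hunks all make the same one-line correction: -Wliteral-range is a Clang-only diagnostic, so asking GCC to ignore it only triggers an unknown-warning complaint of its own; GCC's counterpart for out-of-range literals is -Woverflow. In isolation, the guarded suppression pattern these test files use looks like this (a sketch, not a quotation from any one file):

#if defined(__clang__)
#  pragma clang diagnostic push
#  pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Woverflow"
#endif

// ... test data with deliberately out-of-range floating-point literals ...

#if defined(__clang__)
#  pragma clang diagnostic pop
#elif defined(__GNUC__)
#  pragma GCC diagnostic pop
#endif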