diff --git a/include/boost/math/distributions/beta.hpp b/include/boost/math/distributions/beta.hpp
index 6c17ffa1a2..fef991a870 100644
--- a/include/boost/math/distributions/beta.hpp
+++ b/include/boost/math/distributions/beta.hpp
@@ -25,12 +25,15 @@
 #ifndef BOOST_MATH_DIST_BETA_HPP
 #define BOOST_MATH_DIST_BETA_HPP
 
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/tuple.hpp>
 #include <boost/math/distributions/fwd.hpp>
 #include <boost/math/special_functions/beta.hpp> // for beta.
 #include <boost/math/distributions/complement.hpp> // complements.
 #include <boost/math/distributions/detail/common_error_handling.hpp> // error checks
 #include <boost/math/special_functions/fpclassify.hpp> // isnan.
 #include <boost/math/tools/roots.hpp> // for root finding.
+#include <boost/math/policies/policy.hpp>
 
 #if defined (BOOST_MSVC)
 #  pragma warning(push)
@@ -38,8 +41,6 @@
 // in domain_error_imp in error_handling
 #endif
 
-#include <utility>
-
 namespace boost
 {
   namespace math
@@ -48,7 +49,7 @@ namespace boost
     {
       // Common error checking routines for beta distribution functions:
       template <class RealType, class Policy>
-      inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(alpha) || (alpha <= 0))
        {
@@ -61,7 +62,7 @@ namespace boost
      } // bool check_alpha
 
      template <class RealType, class Policy>
-     inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(beta) || (beta <= 0))
        {
@@ -74,7 +75,7 @@ namespace boost
      } // bool check_beta
 
      template <class RealType, class Policy>
-     inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
      {
        if((p < 0) || (p > 1) || !(boost::math::isfinite)(p))
        {
@@ -87,7 +88,7 @@ namespace boost
      } // bool check_prob
 
      template <class RealType, class Policy>
-     inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(x) || (x < 0) || (x > 1))
        {
@@ -100,28 +101,28 @@ namespace boost
      } // bool check_x
 
      template <class RealType, class Policy>
-     inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
      { // Check both alpha and beta.
        return check_alpha(function, alpha, result, pol) && check_beta(function, beta, result, pol);
      } // bool check_dist
 
      template <class RealType, class Policy>
-     inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
      {
        return check_dist(function, alpha, beta, result, pol)
           && beta_detail::check_x(function, x, result, pol);
      } // bool check_dist_and_x
 
      template <class RealType, class Policy>
-     inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
      {
        return check_dist(function, alpha, beta, result, pol)
           && check_prob(function, p, result, pol);
      } // bool check_dist_and_prob
 
      template <class RealType, class Policy>
-     inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(mean) || (mean <= 0))
        {
@@ -133,7 +134,7 @@ namespace boost
        return true;
      } // bool check_mean
      template <class RealType, class Policy>
-     inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(variance) || (variance <= 0))
        {
@@ -157,7 +158,7 @@ namespace boost
      typedef RealType value_type;
      typedef Policy policy_type;
 
-     beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
+     BOOST_MATH_GPU_ENABLED beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
      {
        RealType result;
        beta_detail::check_dist(
@@ -167,11 +168,11 @@ namespace boost
          &result, Policy());
      } // beta_distribution constructor.
      // Accessor functions:
-     RealType alpha() const
+     BOOST_MATH_GPU_ENABLED RealType alpha() const
      {
        return m_alpha;
      }
-     RealType beta() const
+     BOOST_MATH_GPU_ENABLED RealType beta() const
      { // .
        return m_beta;
      }
@@ -183,11 +184,11 @@ namespace boost
      // http://www.itl.nist.gov/div898/handbook/eda/section3/eda366h.htm
      // http://www.epi.ucdavis.edu/diagnostictests/betabuster.html
 
-     static RealType find_alpha(
+     BOOST_MATH_GPU_ENABLED static RealType find_alpha(
        RealType mean, // Expected value of mean.
        RealType variance) // Expected value of variance.
      {
-       static const char* function = "boost::math::beta_distribution<%1%>::find_alpha";
+       constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha";
        RealType result = 0; // of error checks.
        if(false == (
@@ -201,11 +202,11 @@ namespace boost
        return mean * (( (mean * (1 - mean)) / variance)- 1);
      } // RealType find_alpha
 
-     static RealType find_beta(
+     BOOST_MATH_GPU_ENABLED static RealType find_beta(
        RealType mean, // Expected value of mean.
        RealType variance) // Expected value of variance.
      {
-       static const char* function = "boost::math::beta_distribution<%1%>::find_beta";
+       constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta";
        RealType result = 0; // of error checks.
if(false == ( @@ -223,12 +224,12 @@ namespace boost // Estimate alpha & beta from either alpha or beta, and x and probability. // Uses for these parameter estimators are unclear. - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType beta, // from beta. RealType x, // x. RealType probability) // cdf { - static const char* function = "boost::math::beta_distribution<%1%>::find_alpha"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha"; RealType result = 0; // of error checks. if(false == ( @@ -245,13 +246,13 @@ namespace boost return static_cast(ibeta_inva(beta, x, probability, Policy())); } // RealType find_alpha(beta, a, probability) - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( // ibeta_invb(T b, T x, T p); (alpha, x, cdf,) RealType alpha, // alpha. RealType x, // probability x. RealType probability) // probability cdf. { - static const char* function = "boost::math::beta_distribution<%1%>::find_beta"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta"; RealType result = 0; // of error checks. if(false == ( @@ -281,27 +282,27 @@ namespace boost #endif template - inline const std::pair range(const beta_distribution& /* dist */) + BOOST_MATH_GPU_ENABLED inline const boost::math::pair range(const beta_distribution& /* dist */) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), static_cast(1)); + return boost::math::pair(static_cast(0), static_cast(1)); } template - inline const std::pair support(const beta_distribution& /* dist */) + BOOST_MATH_GPU_ENABLED inline const boost::math::pair support(const beta_distribution& /* dist */) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. - return std::pair(static_cast(0), static_cast(1)); + return boost::math::pair(static_cast(0), static_cast(1)); } template - inline RealType mean(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType mean(const beta_distribution& dist) { // Mean of beta distribution = np. return dist.alpha() / (dist.alpha() + dist.beta()); } // mean template - inline RealType variance(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType variance(const beta_distribution& dist) { // Variance of beta distribution = np(1-p). RealType a = dist.alpha(); RealType b = dist.beta(); @@ -309,9 +310,9 @@ namespace boost } // variance template - inline RealType mode(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType mode(const beta_distribution& dist) { - static const char* function = "boost::math::mode(beta_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mode(beta_distribution<%1%> const&)"; RealType result; if ((dist.alpha() <= 1)) @@ -343,7 +344,7 @@ namespace boost //But WILL be provided by the derived accessor as quantile(0.5). template - inline RealType skewness(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType skewness(const beta_distribution& dist) { BOOST_MATH_STD_USING // ADL of std functions. 
RealType a = dist.alpha(); @@ -352,7 +353,7 @@ namespace boost } // skewness template - inline RealType kurtosis_excess(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const beta_distribution& dist) { RealType a = dist.alpha(); RealType b = dist.beta(); @@ -363,17 +364,17 @@ namespace boost } // kurtosis_excess template - inline RealType kurtosis(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const beta_distribution& dist) { return 3 + kurtosis_excess(dist); } // kurtosis template - inline RealType pdf(const beta_distribution& dist, const RealType& x) + BOOST_MATH_GPU_ENABLED inline RealType pdf(const beta_distribution& dist, const RealType& x) { // Probability Density/Mass Function. BOOST_FPU_EXCEPTION_GUARD - static const char* function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)"; BOOST_MATH_STD_USING // for ADL of std functions @@ -428,11 +429,11 @@ namespace boost } // pdf template - inline RealType cdf(const beta_distribution& dist, const RealType& x) + BOOST_MATH_GPU_ENABLED inline RealType cdf(const beta_distribution& dist, const RealType& x) { // Cumulative Distribution Function beta. BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; RealType a = dist.alpha(); RealType b = dist.beta(); @@ -459,12 +460,12 @@ namespace boost } // beta cdf template - inline RealType cdf(const complemented2_type, RealType>& c) + BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { // Complemented Cumulative Distribution Function beta. BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; RealType const& x = c.param; beta_distribution const& dist = c.dist; @@ -495,7 +496,7 @@ namespace boost } // beta cdf template - inline RealType quantile(const beta_distribution& dist, const RealType& p) + BOOST_MATH_GPU_ENABLED inline RealType quantile(const beta_distribution& dist, const RealType& p) { // Quantile or Percent Point beta function or // Inverse Cumulative probability distribution function CDF. // Return x (0 <= x <= 1), @@ -505,7 +506,7 @@ namespace boost // will be less than or equal to that value // is whatever probability you supplied as an argument. - static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; RealType result = 0; // of argument checks: RealType a = dist.alpha(); @@ -530,12 +531,12 @@ namespace boost } // quantile template - inline RealType quantile(const complemented2_type, RealType>& c) + BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { // Complement Quantile or Percent Point beta function . // Return the number of expected x for a given // complement of the probability q. 
- static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; // // Error checks: diff --git a/include/boost/math/policies/error_handling.hpp b/include/boost/math/policies/error_handling.hpp index ce3f1e7ccd..559e70a2f4 100644 --- a/include/boost/math/policies/error_handling.hpp +++ b/include/boost/math/policies/error_handling.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef BOOST_MATH_HAS_NVRTC @@ -877,20 +878,6 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, s } //namespace policies -namespace detail{ - -// -// Simple helper function to assist in returning a pair from a single value, -// that value usually comes from one of the error handlers above: -// -template -BOOST_MATH_GPU_ENABLED std::pair pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T) -{ - return std::make_pair(val, val); -} - -} - #ifdef _MSC_VER # pragma warning(pop) #endif @@ -1039,7 +1026,21 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, b } // namespace math } // namespace boost -#endif +#endif // BOOST_MATH_HAS_NVRTC + +namespace boost { namespace math { namespace detail { + +// +// Simple helper function to assist in returning a pair from a single value, +// that value usually comes from one of the error handlers above: +// +template +BOOST_MATH_GPU_ENABLED boost::math::pair pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T) +{ + return boost::math::make_pair(val, val); +} + +}}} // boost::math::detail #endif // BOOST_MATH_POLICY_ERROR_HANDLING_HPP diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp index 00b8e45bf2..27901a1131 100644 --- a/include/boost/math/special_functions/beta.hpp +++ b/include/boost/math/special_functions/beta.hpp @@ -28,14 +28,10 @@ #include #include #include - -#ifndef BOOST_MATH_HAS_NVRTC #include #include #include #include -#include -#endif namespace boost{ namespace math{ @@ -800,7 +796,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::la policies::check_series_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%) in ibeta_series (without lanczos)", max_iter, pol); return result; } - +#endif // // Continued fraction for the incomplete beta: // @@ -884,7 +880,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& p return prefix; } -#endif + // // This function is only needed for the non-regular incomplete beta, // it computes the delta in: @@ -958,7 +954,6 @@ struct Pn_size #endif }; -#ifndef BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised) { @@ -1060,7 +1055,7 @@ BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T } return sum; } // template T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Lanczos& l, bool normalised) -#endif + // // For integer arguments we can relate the incomplete beta to the // complement of the binomial distribution cdf and use this finite sum. 
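[Note: the comment above refers to the standard identity I_x(k, n - k + 1) = sum_{j=k}^{n} C(n, j) x^j (1 - x)^(n - j), relating the regularized incomplete beta to a binomial tail probability. A minimal host-side sketch checking that identity through the public API; illustrative only, not part of the patch, and the tolerance 1e-12 is an arbitrary choice:]

// Sketch: ibeta(k, n - k + 1, x) equals the binomial tail sum P(X >= k) for X ~ Binomial(n, x).
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/binomial.hpp>
#include <cassert>
#include <cmath>

int main()
{
   const unsigned n = 10, k = 4;
   const double x = 0.3;
   double tail = 0;
   for (unsigned j = k; j <= n; ++j) // the finite sum over the upper binomial terms
      tail += boost::math::binomial_coefficient<double>(n, j) * std::pow(x, j) * std::pow(1 - x, n - j);
   assert(std::fabs(boost::math::ibeta(k, n - k + 1, x) - tail) < 1e-12);
   return 0;
}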
@@ -1130,6 +1125,7 @@ BOOST_MATH_GPU_ENABLED T binomial_ccdf(T n, T k, T x, T y, const Policy& pol) // input range and select the right implementation method for // each domain: // + template BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_derivative) { @@ -1749,12 +1745,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type } // namespace math } // namespace boost -// TODO(mborland): Get the ibeta_inv working on NVRTC -#ifndef BOOST_MATH_HAS_NVRTC - #include #include -#endif - #endif // BOOST_MATH_SPECIAL_BETA_HPP diff --git a/include/boost/math/special_functions/binomial.hpp b/include/boost/math/special_functions/binomial.hpp index e776a90bb8..3c49ff30d5 100644 --- a/include/boost/math/special_functions/binomial.hpp +++ b/include/boost/math/special_functions/binomial.hpp @@ -10,20 +10,21 @@ #pragma once #endif +#include +#include #include #include #include #include -#include namespace boost{ namespace math{ template -T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); BOOST_MATH_STD_USING - static const char* function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)"; + constexpr auto function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)"; if(k > n) return policies::raise_domain_error(function, "The binomial coefficient is undefined for k > n, but got k = %1%.", static_cast(k), pol); T result; // LCOV_EXCL_LINE @@ -43,9 +44,9 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) { // Use the beta function: if(k < n - k) - result = static_cast(k * beta(static_cast(k), static_cast(n-k+1), pol)); + result = static_cast(k * boost::math::beta(static_cast(k), static_cast(n-k+1), pol)); else - result = static_cast((n - k) * beta(static_cast(k+1), static_cast(n-k), pol)); + result = static_cast((n - k) * boost::math::beta(static_cast(k+1), static_cast(n-k), pol)); if(result == 0) return policies::raise_overflow_error(function, nullptr, pol); result = 1 / result; @@ -59,7 +60,7 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) // we'll promote to double: // template <> -inline float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>&) +BOOST_MATH_GPU_ENABLED inline float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>&) { typedef policies::normalise< policies::policy<>, @@ -71,7 +72,7 @@ inline float binomial_coefficient >(unsigned n, unsign } template -inline T binomial_coefficient(unsigned n, unsigned k) +BOOST_MATH_GPU_ENABLED inline T binomial_coefficient(unsigned n, unsigned k) { return binomial_coefficient(n, k, policies::policy<>()); } diff --git a/include/boost/math/special_functions/cbrt.hpp b/include/boost/math/special_functions/cbrt.hpp index fb05996cf1..7fdf78d014 100644 --- a/include/boost/math/special_functions/cbrt.hpp +++ b/include/boost/math/special_functions/cbrt.hpp @@ -11,15 +11,16 @@ #pragma once #endif -#ifndef __CUDACC_RTC__ - #include + +#ifndef BOOST_MATH_HAS_NVRTC + #include +#include +#include #include #include #include -#include -#include namespace boost{ namespace math{ @@ -174,19 +175,30 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type cbrt(T z) } // namespace math } // namespace boost -#else +#else // 
Special NVRTC handling namespace boost { namespace math { template -__host__ __device__ T cbrt(T x) +BOOST_MATH_GPU_ENABLED double cbrt(T x) +{ + return ::cbrt(x); +} + +BOOST_MATH_GPU_ENABLED inline float cbrt(float x) +{ + return ::cbrtf(x); +} + +template +BOOST_MATH_GPU_ENABLED double cbrt(T x, const Policy&) { return ::cbrt(x); } -template <> -__host__ __device__ float cbrt(float x) +template +BOOST_MATH_GPU_ENABLED float cbrt(float x, const Policy&) { return ::cbrtf(x); } diff --git a/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp b/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp index 9e30db2a37..aab18f50f1 100644 --- a/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp +++ b/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp @@ -17,17 +17,19 @@ #pragma once #endif -#include -#include +#include #include +#include +#include +#include namespace boost{ namespace math{ namespace detail{ template struct beta_inv_ab_t { - beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {} - T operator()(T a) + BOOST_MATH_GPU_ENABLED beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {} + BOOST_MATH_GPU_ENABLED T operator()(T a) { return invert ? p - boost::math::ibetac(swap_ab ? b : a, swap_ab ? a : b, z, Policy()) @@ -39,7 +41,7 @@ struct beta_inv_ab_t }; template -T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol) { BOOST_MATH_STD_USING // mean: @@ -72,7 +74,7 @@ T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Pol } template -T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol) { BOOST_MATH_STD_USING // for ADL of std lib math functions // @@ -121,11 +123,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // if((p < q) != swap_ab) { - guess = (std::min)(T(b * 2), T(1)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(1)); } else { - guess = (std::min)(T(b / 2), T(1)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(1)); } } if(n * n * n * u * sf > 0.005) @@ -138,11 +140,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // if((p < q) != swap_ab) { - guess = (std::min)(T(b * 2), T(10)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(10)); } else { - guess = (std::min)(T(b / 2), T(10)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(10)); } } else @@ -151,8 +153,8 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // // Max iterations permitted: // - std::uintmax_t max_iter = policies::get_max_root_iterations(); - std::pair r = bracket_and_solve_root(f, guess, factor, swap_ab ? true : false, tol, max_iter, pol); + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::pair r = bracket_and_solve_root(f, guess, factor, swap_ab ? 
true : false, tol, max_iter, pol); if(max_iter >= policies::get_max_root_iterations()) return policies::raise_evaluation_error("boost::math::ibeta_invab_imp<%1%>(%1%,%1%,%1%)", "Unable to locate the root within a reasonable number of iterations, closest approximation so far was %1%", r.first, pol); return (r.first + r.second) / 2; @@ -161,7 +163,7 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, } // namespace detail template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibeta_inva(RT1 b, RT2 x, RT3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -194,7 +196,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibetac_inva(RT1 b, RT2 x, RT3 q, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -227,7 +229,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibeta_invb(RT1 a, RT2 x, RT3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -260,7 +262,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibetac_invb(RT1 a, RT2 x, RT3 q, const Policy& pol) { constexpr auto function = "boost::math::ibeta_invb<%1%>(%1%, %1%, %1%)"; @@ -293,28 +295,28 @@ typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inva(RT1 b, RT2 x, RT3 p) { return boost::math::ibeta_inva(b, x, p, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inva(RT1 b, RT2 x, RT3 q) { return boost::math::ibetac_inva(b, x, q, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_invb(RT1 a, RT2 x, RT3 p) { return boost::math::ibeta_invb(a, x, p, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_invb(RT1 a, RT2 x, RT3 q) { return boost::math::ibetac_invb(a, x, q, policies::policy<>()); diff --git a/include/boost/math/special_functions/detail/ibeta_inverse.hpp b/include/boost/math/special_functions/detail/ibeta_inverse.hpp index 9e4fb08d4e..90f6e90705 100644 --- a/include/boost/math/special_functions/detail/ibeta_inverse.hpp +++ b/include/boost/math/special_functions/detail/ibeta_inverse.hpp @@ -11,12 +11,14 @@ #pragma once #endif +#include +#include +#include +#include #include #include -#include #include #include -#include namespace boost{ namespace math{ namespace detail{ @@ -27,12 +29,12 @@ namespace boost{ namespace math{ namespace detail{ template struct temme_root_finder { - temme_root_finder(const T t_, const T a_) : t(t_), a(a_) { + BOOST_MATH_GPU_ENABLED temme_root_finder(const T t_, const T a_) : t(t_), a(a_) { BOOST_MATH_ASSERT( math::tools::epsilon() <= a && !(boost::math::isinf)(a)); } - boost::math::tuple operator()(T x) + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(T x) { BOOST_MATH_STD_USING // ADL of std names @@ -52,7 +54,7 @@ struct temme_root_finder // Section 2. 
// template -T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -138,7 +140,7 @@ T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) // Section 3. // template -T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -315,7 +317,7 @@ T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy // Section 4. // template -T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -420,10 +422,10 @@ T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) template struct ibeta_roots { - ibeta_roots(T _a, T _b, T t, bool inv = false) + BOOST_MATH_GPU_ENABLED ibeta_roots(T _a, T _b, T t, bool inv = false) : a(_a), b(_b), target(t), invert(inv) {} - boost::math::tuple operator()(T x) + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(T x) { BOOST_MATH_STD_USING // ADL of std names @@ -457,7 +459,7 @@ struct ibeta_roots }; template -T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) +BOOST_MATH_GPU_ENABLED T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) { BOOST_MATH_STD_USING // For ADL of math functions. @@ -487,8 +489,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) return p; } // Change things around so we can handle as b == 1 special case below: - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = true; } // @@ -524,8 +526,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } else if(b > 0.5f) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } } @@ -559,7 +561,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) y = -boost::math::expm1(boost::math::log1p(-q, pol) / a, pol); } if(invert) - std::swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(x, y); if(py) *py = y; return x; @@ -574,12 +576,12 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(p > 0.5) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } - T minv = (std::min)(a, b); - T maxv = (std::max)(a, b); + T minv = BOOST_MATH_GPU_SAFE_MIN(a, b); + T maxv = BOOST_MATH_GPU_SAFE_MAX(a, b); if((sqrt(minv) > (maxv - minv)) && (minv > 5)) { // @@ -630,8 +632,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(a < b) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } // @@ -694,8 +696,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } if(fs < 0) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; xs = 1 - xs; } @@ -758,9 +760,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) if(ps < 0) { - std::swap(a, b); - std::swap(p, q); - std::swap(xs, xs2); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); + BOOST_MATH_GPU_SAFE_SWAP(xs, xs2); invert = !invert; } // @@ -823,8 +825,8 @@ T ibeta_inv_imp(T a, T b, T p, T 
q, const Policy& pol, T* py) // if(b < a) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } if (a < tools::min_value()) @@ -890,9 +892,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(x > 0.5) { - std::swap(a, b); - std::swap(p, q); - std::swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); + BOOST_MATH_GPU_SAFE_SWAP(x, y); invert = !invert; T l = 1 - upper; T u = 1 - lower; @@ -922,8 +924,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) if(x < lower) x = lower; } - std::uintmax_t max_iter = policies::get_max_root_iterations(); - std::uintmax_t max_iter_used = 0; + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::uintmax_t max_iter_used = 0; // // Figure out how many digits to iterate towards: // @@ -946,7 +948,13 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // Now iterate, we can use either p or q as the target here // depending on which is smaller: // + // Since we can't use halley_iterate on device we use newton raphson + // + #ifndef BOOST_MATH_HAS_GPU_SUPPORT x = boost::math::tools::halley_iterate( + #else + x = boost::math::tools::newton_raphson_iterate( + #endif boost::math::detail::ibeta_roots(a, b, (p < q ? p : q), (p < q ? false : true)), x, lower, upper, digits, max_iter); policies::check_root_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%)", max_iter + max_iter_used, pol); // @@ -968,7 +976,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, T4* py, const Policy& pol) { constexpr auto function = "boost::math::ibeta_inv<%1%>(%1%,%1%,%1%)"; @@ -1003,14 +1011,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, T4* py) { return ibeta_inv(a, b, p, py, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p) { typedef typename tools::promote_args::type result_type; @@ -1018,7 +1026,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -1026,7 +1034,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(T1 a, T2 b, T3 q, T4* py, const Policy& pol) { constexpr auto function = "boost::math::ibetac_inv<%1%>(%1%,%1%,%1%)"; @@ -1061,14 +1069,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(T1 a, T2 b, T3 q, T4* py) { return ibetac_inv(a, b, q, py, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(RT1 a, RT2 b, RT3 q) { typedef typename tools::promote_args::type result_type; @@ -1076,7 +1084,7 @@ inline typename tools::promote_args::type } template -inline typename 
tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(RT1 a, RT2 b, RT3 q, const Policy& pol) { typedef typename tools::promote_args::type result_type; diff --git a/include/boost/math/special_functions/detail/t_distribution_inv.hpp b/include/boost/math/special_functions/detail/t_distribution_inv.hpp index 9209b6d405..22e1d11f02 100644 --- a/include/boost/math/special_functions/detail/t_distribution_inv.hpp +++ b/include/boost/math/special_functions/detail/t_distribution_inv.hpp @@ -11,6 +11,9 @@ #pragma once #endif +#include +#include +#include #include #include #include @@ -24,7 +27,7 @@ namespace boost{ namespace math{ namespace detail{ // Communications of the ACM, 13(10): 619-620, Oct., 1970. // template -T inverse_students_t_hill(T ndf, T u, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_hill(T ndf, T u, const Policy& pol) { BOOST_MATH_STD_USING BOOST_MATH_ASSERT(u <= 0.5); @@ -74,7 +77,7 @@ T inverse_students_t_hill(T ndf, T u, const Policy& pol) // Journal of Computational Finance, Vol 9 Issue 4, pp 37-73, Summer 2006 // template -T inverse_students_t_tail_series(T df, T v, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_tail_series(T df, T v, const Policy& pol) { BOOST_MATH_STD_USING // Tail series expansion, see section 6 of Shaw's paper. @@ -125,7 +128,7 @@ T inverse_students_t_tail_series(T df, T v, const Policy& pol) } template -T inverse_students_t_body_series(T df, T u, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_body_series(T df, T u, const Policy& pol) { BOOST_MATH_STD_USING // @@ -204,7 +207,7 @@ T inverse_students_t_body_series(T df, T u, const Policy& pol) } template -T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) +BOOST_MATH_GPU_ENABLED T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) { // // df = number of degrees of freedom. 
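[Note: the detail routines in this file back the public Student's t quantile. A short host-side usage sketch of the API they serve; illustrative only, not part of the patch, and the probe values (5 degrees of freedom, p = 0.975) are arbitrary:]

#include <boost/math/distributions/students_t.hpp>
#include <iostream>

int main()
{
   // quantile() routes through detail::fast_students_t_quantile and
   // detail::inverse_students_t for suitable df and precision:
   boost::math::students_t_distribution<double> dist(5); // 5 degrees of freedom
   std::cout << quantile(dist, 0.975) << '\n';           // approx. 2.5706
   return 0;
}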
@@ -220,7 +223,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) if(u > v) { // function is symmetric, invert it: - std::swap(u, v); + BOOST_MATH_GPU_SAFE_SWAP(u, v); invert = true; } if((floor(df) == df) && (df < 20)) @@ -416,7 +419,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) } template -inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) { T u = p / 2; T v = 1 - u; @@ -427,7 +430,7 @@ inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) } template -inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::false_type*) +BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::false_type*) { BOOST_MATH_STD_USING // @@ -450,12 +453,12 @@ inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::f } template -T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_type*) +BOOST_MATH_GPU_ENABLED T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::true_type*) { BOOST_MATH_STD_USING bool invert = false; if((df < 2) && (floor(df) != df)) - return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast(nullptr)); + return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast(nullptr)); if(p > 0.5) { p = 1 - p; @@ -521,7 +524,7 @@ T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_typ } template -inline T fast_students_t_quantile(T df, T p, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile(T df, T p, const Policy& pol) { typedef typename policies::evaluation::type value_type; typedef typename policies::normalise< @@ -531,12 +534,12 @@ inline T fast_students_t_quantile(T df, T p, const Policy& pol) policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant::digits <= 53) + typedef boost::math::integral_constant::digits <= 53) && - (std::numeric_limits::is_specialized) + (boost::math::numeric_limits::is_specialized) && - (std::numeric_limits::radix == 2) + (boost::math::numeric_limits::radix == 2) > tag_type; return policies::checked_narrowing_cast(fast_students_t_quantile_imp(static_cast(df), static_cast(p), pol, static_cast(nullptr)), "boost::math::students_t_quantile<%1%>(%1%,%1%,%1%)"); } diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 186befb612..41c85936dd 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1735,10 +1735,50 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in // // Ratios of two gamma functions: // +template +BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos_final(T z, T delta, const Policy& pol, const Lanczos&) +{ + BOOST_MATH_STD_USING + + T zgh = static_cast(z + T(Lanczos::g()) - constants::half()); + T result; + if(z + delta == z) + { + if (fabs(delta / zgh) < boost::math::tools::epsilon()) + { + // We have: + // result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); + // 0.5 - z == -z + // log1p(delta / zgh) = delta / zgh = delta / z + // multiplying we get -delta. + result = exp(-delta); + } + else + // from the pow formula below... 
but this may actually be wrong, we just can't really calculate it :( + result = 1; + } + else + { + if(fabs(delta) < 10) + { + result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); + } + else + { + result = pow(T(zgh / (zgh + delta)), T(z - constants::half())); + } + // Split the calculation up to avoid spurious overflow: + result *= Lanczos::lanczos_sum(z) / Lanczos::lanczos_sum(T(z + delta)); + } + result *= pow(T(constants::e() / (zgh + delta)), delta); + return result; +} + template BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING + if(z < tools::epsilon()) { // @@ -1752,7 +1792,7 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli // if(boost::math::max_factorial::value < delta) { - T ratio = tgamma_delta_ratio_imp_lanczos(delta, T(boost::math::max_factorial::value - delta), pol, l); + T ratio = tgamma_delta_ratio_imp_lanczos_final(T(delta), T(boost::math::max_factorial::value - delta), pol, l); ratio *= z; ratio *= boost::math::unchecked_factorial(boost::math::max_factorial::value - 1); return 1 / ratio; @@ -1773,39 +1813,10 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli #endif } } - T zgh = static_cast(z + T(Lanczos::g()) - constants::half()); - T result; - if(z + delta == z) - { - if (fabs(delta / zgh) < boost::math::tools::epsilon()) - { - // We have: - // result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); - // 0.5 - z == -z - // log1p(delta / zgh) = delta / zgh = delta / z - // multiplying we get -delta. - result = exp(-delta); - } - else - // from the pow formula below... but this may actually be wrong, we just can't really calculate it :( - result = 1; - } - else - { - if(fabs(delta) < 10) - { - result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); - } - else - { - result = pow(T(zgh / (zgh + delta)), T(z - constants::half())); - } - // Split the calculation up to avoid spurious overflow: - result *= Lanczos::lanczos_sum(z) / Lanczos::lanczos_sum(T(z + delta)); - } - result *= pow(T(constants::e() / (zgh + delta)), delta); - return result; + + return tgamma_delta_ratio_imp_lanczos_final(T(z), T(delta), pol, l); } + // // And again without Lanczos support this time: // diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 16ae3b61eb..e3a2722c3d 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -26,7 +26,19 @@ #include -#ifndef BOOST_MATH_HAS_NVRTC +#ifdef BOOST_MATH_HAS_NVRTC + +namespace boost { +namespace math { + +template +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type +beta(RT1 a, RT2 b, A arg); + +} // namespace math +} // namespace boost + +#else #include #include @@ -154,9 +166,9 @@ namespace boost // Binomial: template - T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); + BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); template - T binomial_coefficient(unsigned n, unsigned k); + BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k); // erf & erfc error functions. template // Error function. 
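[Note: BOOST_MATH_GPU_ENABLED drives every declaration change in this file. The expansion sketched below is an assumption for orientation only; the authoritative dispatch lives in boost/math/tools/config.hpp:]

// Simplified sketch, not the verbatim config logic:
#if defined(__CUDACC__) || defined(__HIPCC__)
#  define BOOST_MATH_GPU_ENABLED __host__ __device__ // callable from host and device code
#else
#  define BOOST_MATH_GPU_ENABLED                     // host-only (and SYCL) builds: a no-op
#endif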
@@ -874,19 +886,19 @@ namespace boost BOOST_MATH_GPU_ENABLED tools::promote_args_t cos_pi(T x); template - int fpclassify BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED int fpclassify BOOST_NO_MACRO_EXPAND(T t); template - bool isfinite BOOST_NO_MACRO_EXPAND(T z); + BOOST_MATH_GPU_ENABLED bool isfinite BOOST_NO_MACRO_EXPAND(T z); template - bool isinf BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isinf BOOST_NO_MACRO_EXPAND(T t); template - bool isnan BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isnan BOOST_NO_MACRO_EXPAND(T t); template - bool isnormal BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isnormal BOOST_NO_MACRO_EXPAND(T t); template BOOST_MATH_GPU_ENABLED int signbit BOOST_NO_MACRO_EXPAND(T x); @@ -1218,62 +1230,62 @@ namespace boost BOOST_MATH_DETAIL_11_FUNC(Policy)\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ beta(RT1 a, RT2 b) { return ::boost::math::beta(a, b, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ beta(RT1 a, RT2 b, A x){ return ::boost::math::beta(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ betac(RT1 a, RT2 b, RT3 x) { return ::boost::math::betac(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibetac(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inv(T1 a, T2 b, T3 p, T4* py){ return ::boost::math::ibeta_inv(a, b, p, py, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inv(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inv(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inv(T1 a, T2 b, T3 q, T4* py){ return ::boost::math::ibetac_inv(a, b, q, py, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inva(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inva(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inva(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_inva(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_invb(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_invb(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_invb(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_invb(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inv(RT1 a, RT2 b, RT3 q){ return 
::boost::math::ibetac_inv(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_derivative(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta_derivative(a, b, x, Policy()); }\ \ - template T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient(n, k, Policy()); }\ + template BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient(n, k, Policy()); }\ \ template \ BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erf(RT z) { return ::boost::math::erf(z, Policy()); }\ diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index a2af127630..fac8f84db0 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -800,7 +800,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return #define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast(V) #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING -#define BOOST_MATH_IF_CONSTEXPR if constexpr +#define BOOST_MATH_IF_CONSTEXPR if #define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point::value) #define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr #define BOOST_MATH_NO_EXCEPTIONS diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 1fb55da197..c9a70e8a99 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -25,6 +25,13 @@ run test_bernoulli_pdf_float.cu ; run test_bernoulli_range_support_double.cu ; run test_bernoulli_range_support_float.cu ; +run test_beta_dist_cdf_double.cu ; +run test_beta_dist_cdf_float.cu ; +run test_beta_dist_pdf_double.cu ; +run test_beta_dist_pdf_float.cu ; +run test_beta_dist_quan_double.cu ; +run test_beta_dist_quan_float.cu ; + run test_cauchy_cdf_double.cu ; run test_cauchy_cdf_float.cu ; run test_cauchy_pdf_double.cu ; @@ -107,6 +114,24 @@ run test_weibull_quan_float.cu ; # Special Functions run test_beta_double.cu ; run test_beta_float.cu ; +run test_betac_double.cu ; +run test_betac_float.cu ; +run test_ibeta_double.cu ; +run test_ibeta_float.cu ; +run test_ibeta_derivative_double.cu ; +run test_ibeta_derivative_float.cu ; +run test_ibeta_inv_double.cu ; +run test_ibeta_inv_float.cu ; +run test_ibeta_inva_double.cu ; +run test_ibeta_inva_float.cu ; +run test_ibeta_invb_double.cu ; +run test_ibeta_invb_float.cu ; +run test_ibetac_inv_double.cu ; +run test_ibetac_inv_float.cu ; +run test_ibetac_inva_double.cu ; +run test_ibetac_inva_float.cu ; +run test_ibetac_invb_double.cu ; +run test_ibetac_invb_float.cu ; run test_bessel_i0_double.cu ; run test_bessel_i0_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 4a37960a51..1fc2746a1f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -24,6 +24,13 @@ run test_bernoulli_pdf_nvrtc_float.cpp ; run test_bernoulli_quan_nvrtc_double.cpp ; run test_bernoulli_quan_nvrtc_float.cpp ; +run test_beta_dist_cdf_nvrtc_double.cpp ; +run test_beta_dist_cdf_nvrtc_float.cpp ; +run test_beta_dist_pdf_nvrtc_double.cpp ; +run test_beta_dist_pdf_nvrtc_float.cpp ; +run test_beta_dist_quan_nvrtc_double.cpp ; +run test_beta_dist_quan_nvrtc_float.cpp ; + run test_cauchy_cdf_nvrtc_double.cpp ; run test_cauchy_cdf_nvrtc_float.cpp ; run test_cauchy_pdf_nvrtc_double.cpp ; @@ -104,6 +111,26 @@ run test_weibull_quan_nvrtc_float.cpp ; # Special Functions run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; +run test_betac_nvrtc_double.cpp ; 
+run test_betac_nvrtc_float.cpp ; +run test_ibeta_nvrtc_double.cpp ; +run test_ibeta_nvrtc_float.cpp ; +run test_ibetac_nvrtc_double.cpp ; +run test_ibetac_nvrtc_float.cpp ; +run test_ibeta_derivative_nvrtc_double.cpp ; +run test_ibeta_derivative_nvrtc_float.cpp ; +run test_ibeta_inv_nvrtc_double.cpp ; +run test_ibeta_inv_nvrtc_float.cpp ; +run test_ibeta_inva_nvrtc_double.cpp ; +run test_ibeta_inva_nvrtc_float.cpp ; +run test_ibeta_invb_nvrtc_double.cpp ; +run test_ibeta_invb_nvrtc_float.cpp ; +run test_ibetac_inv_nvrtc_double.cpp ; +run test_ibetac_inv_nvrtc_float.cpp ; +run test_ibetac_inva_nvrtc_double.cpp ; +run test_ibetac_inva_nvrtc_float.cpp ; +run test_ibetac_invb_nvrtc_double.cpp ; +run test_ibetac_invb_nvrtc_float.cpp ; run test_bessel_i0_nvrtc_double.cpp ; run test_bessel_i0_nvrtc_float.cpp ; diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 2fd5954ae1..5d3d85cd8f 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -12,6 +12,7 @@ project : requirements # Distributions run test_arcsine.cpp ; run test_bernoulli.cpp ; +run test_beta_dist.cpp ; run test_cauchy.cpp ; run test_chi_squared.cpp ; run test_exponential_dist.cpp ; @@ -28,6 +29,10 @@ run test_weibull.cpp ; run pow_test.cpp ; run test_beta_simple.cpp ; +run test_beta.cpp ; +run test_ibeta.cpp ; +run test_ibeta_inv.cpp ; +run test_ibeta_inv_ab.cpp ; run test_bessel_i.cpp ; run test_bessel_j.cpp ; diff --git a/test/test_bessel_i.cpp b/test/test_bessel_i.cpp index 09487ddf1b..817569760a 100644 --- a/test/test_bessel_i.cpp +++ b/test/test_bessel_i.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_i.hpp" diff --git a/test/test_bessel_j.cpp b/test/test_bessel_j.cpp index 31a64bc579..1dd63a68a5 100644 --- a/test/test_bessel_j.cpp +++ b/test/test_bessel_j.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_j.hpp" diff --git a/test/test_bessel_k.cpp b/test/test_bessel_k.cpp index 84ba0830f2..6c31f5ab05 100644 --- a/test/test_bessel_k.cpp +++ b/test/test_bessel_k.cpp @@ -22,7 +22,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_k.hpp" diff --git a/test/test_bessel_y.cpp b/test/test_bessel_y.cpp index 232a903963..8251920c5b 100644 --- a/test/test_bessel_y.cpp +++ b/test/test_bessel_y.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_y.hpp" diff --git a/test/test_beta.cpp b/test/test_beta.cpp index b24cb32c07..4e27b71353 100644 --- a/test/test_beta.cpp +++ b/test/test_beta.cpp @@ -5,7 +5,17 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "pch_light.hpp" +#ifndef SYCL_LANGUAGE_VERSION +#include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif #include "test_beta.hpp" diff --git a/test/test_beta.hpp b/test/test_beta.hpp index e633935a3c..3019c17e71 100644 --- a/test/test_beta.hpp +++ b/test/test_beta.hpp @@ -18,9 +18,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include @@ -109,9 +110,12 @@ void test_spots(T) // Inexact input, so disable for ultra precise long doubles: BOOST_CHECK_CLOSE(::boost::math::beta(static_cast(0.0125L), static_cast(0.000023L)), static_cast(43558.24045647538375006349016083320744662L), tolerance * 2); } + + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_THROW(boost::math::beta(static_cast(0), static_cast(1)), std::domain_error); BOOST_CHECK_THROW(boost::math::beta(static_cast(-1), static_cast(1)), std::domain_error); BOOST_CHECK_THROW(boost::math::beta(static_cast(1), static_cast(-1)), std::domain_error); BOOST_CHECK_THROW(boost::math::beta(static_cast(1), static_cast(0)), std::domain_error); + #endif } diff --git a/test/test_beta_dist.cpp b/test/test_beta_dist.cpp index 943718a39f..1652309eb7 100644 --- a/test/test_beta_dist.cpp +++ b/test/test_beta_dist.cpp @@ -32,9 +32,14 @@ # pragma warning (disable : 4224) // nonstandard extension used : formal parameter 'arg' was previously defined as a type. #endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; -#include +#endif + +#include "../include_private/boost/math/tools/test.hpp" #include // for beta_distribution using boost::math::beta_distribution; @@ -634,12 +639,13 @@ BOOST_AUTO_TEST_CASE( test_main ) BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_alpha(mybeta22.beta(), 0.8, cdf(mybeta22, 0.8)), mybeta22.alpha(), tol); BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_beta(mybeta22.alpha(), 0.8, cdf(mybeta22, 0.8)), mybeta22.beta(), tol); - + #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS beta_distribution rcbeta22(2, 2); // Using RealType real_concept. cout << "numeric_limits::is_specialized " << numeric_limits::is_specialized << endl; cout << "numeric_limits::digits " << numeric_limits::digits << endl; cout << "numeric_limits::digits10 " << numeric_limits::digits10 << endl; cout << "numeric_limits::epsilon " << numeric_limits::epsilon() << endl; + #endif // (Parameter value, arbitrarily zero, only communicates the floating point type). test_spots(0.0F); // Test float. diff --git a/test/test_beta_dist_cdf_double.cu b/test/test_beta_dist_cdf_double.cu new file mode 100644 index 0000000000..fa460244a3 --- /dev/null +++ b/test/test_beta_dist_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_cdf_float.cu b/test/test_beta_dist_cdf_float.cu
new file mode 100644
index 0000000000..321c844205
--- /dev/null
+++ b/test/test_beta_dist_cdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_cdf_nvrtc_double.cpp b/test/test_beta_dist_cdf_nvrtc_double.cpp new file mode 100644 index 0000000000..4f5913c108 --- /dev/null +++ b/test/test_beta_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
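+        // The second input buffer is filled and copied to the device only to keep
+        // a uniform kernel signature across these NVRTC tests: the beta CDF kernel
+        // above ignores its second (unnamed) pointer parameter.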
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_cdf_nvrtc_float.cpp b/test/test_beta_dist_cdf_nvrtc_float.cpp new file mode 100644 index 0000000000..f5b031c5a9 --- /dev/null +++ b/test/test_beta_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_pdf_double.cu b/test/test_beta_dist_pdf_double.cu new file mode 100644 index 0000000000..c0ee9272ae --- /dev/null +++ b/test/test_beta_dist_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_pdf_float.cu b/test/test_beta_dist_pdf_float.cu new file mode 100644 index 0000000000..75e4fa27b4 --- /dev/null +++ b/test/test_beta_dist_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_pdf_nvrtc_double.cpp b/test/test_beta_dist_pdf_nvrtc_double.cpp new file mode 100644 index 0000000000..c9870e2ce4 --- /dev/null +++ b/test/test_beta_dist_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
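+        // The fixed seed (42) above makes the host/device comparison below
+        // reproducible from run to run against the 300 eps tolerance.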
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_pdf_nvrtc_float.cpp b/test/test_beta_dist_pdf_nvrtc_float.cpp new file mode 100644 index 0000000000..0b4fd83488 --- /dev/null +++ b/test/test_beta_dist_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_quan_double.cu b/test/test_beta_dist_quan_double.cu new file mode 100644 index 0000000000..101526afae --- /dev/null +++ b/test/test_beta_dist_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_quan_float.cu b/test/test_beta_dist_quan_float.cu new file mode 100644 index 0000000000..77696c6393 --- /dev/null +++ b/test/test_beta_dist_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_quan_nvrtc_double.cpp b/test/test_beta_dist_quan_nvrtc_double.cpp new file mode 100644 index 0000000000..9726bf019e --- /dev/null +++ b/test/test_beta_dist_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_quan_nvrtc_float.cpp b/test/test_beta_dist_quan_nvrtc_float.cpp new file mode 100644 index 0000000000..d2476cb2ac --- /dev/null +++ b/test/test_beta_dist_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_betac_double.cu b/test/test_betac_double.cu new file mode 100644 index 0000000000..8bb31d3219 --- /dev/null +++ b/test/test_betac_double.cu @@ -0,0 +1,146 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::betac(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "beta_med_data.ipp" +#include "beta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < beta_med_data.size(); ++i) + { + v1.push_back(beta_med_data[i][0]); + v2.push_back(beta_med_data[i][1]); + v3.push_back(beta_med_data[i][2]); + } + for(unsigned i = 0; i < beta_small_data.size(); ++i) + { + v1.push_back(beta_small_data[i][0]); + v2.push_back(beta_small_data[i][1]); + v3.push_back(beta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_betac_float.cu b/test/test_betac_float.cu new file mode 100644 index 0000000000..7070c567cc --- /dev/null +++ b/test/test_betac_float.cu @@ -0,0 +1,146 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::betac(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "beta_med_data.ipp" +#include "beta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < beta_med_data.size(); ++i) + { + v1.push_back(beta_med_data[i][0]); + v2.push_back(beta_med_data[i][1]); + v3.push_back(beta_med_data[i][2]); + } + for(unsigned i = 0; i < beta_small_data.size(); ++i) + { + v1.push_back(beta_small_data[i][0]); + v2.push_back(beta_small_data[i][1]); + v3.push_back(beta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel 
(error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_betac_nvrtc_double.cpp b/test/test_betac_nvrtc_double.cpp new file mode 100644 index 0000000000..0667cfe0d4 --- /dev/null +++ b/test/test_betac_nvrtc_double.cpp @@ -0,0 +1,196 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::betac(in1[i], in2[i], in3[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + h_in3[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + 
delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_betac_nvrtc_float.cpp b/test/test_betac_nvrtc_float.cpp
new file mode 100644
index 0000000000..0667cfe0d4
--- /dev/null
+++ b/test/test_betac_nvrtc_float.cpp
@@ -0,0 +1,196 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_beta_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+            h_in3[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully."
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_erf.cpp b/test/test_erf.cpp index 5044847114..2232c1c759 100644 --- a/test/test_erf.cpp +++ b/test/test_erf.cpp @@ -13,7 +13,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error diff --git a/test/test_holtsmark.cpp b/test/test_holtsmark.cpp index 475f5400aa..93a40924d6 100644 --- a/test/test_holtsmark.cpp +++ b/test/test_holtsmark.cpp @@ -21,7 +21,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif using boost::math::holtsmark_distribution; diff --git a/test/test_ibeta.cpp b/test/test_ibeta.cpp index e026ac6c52..987b361105 100644 --- a/test/test_ibeta.cpp +++ b/test/test_ibeta.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include "test_ibeta.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta.hpp b/test/test_ibeta.hpp index 7c951d614f..cfd5d78cd1 100644 --- a/test/test_ibeta.hpp +++ b/test/test_ibeta.hpp @@ -8,9 +8,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include diff --git a/test/test_ibeta_derivative.cpp b/test/test_ibeta_derivative.cpp index c899c94bf5..5d6a312754 100644 --- a/test/test_ibeta_derivative.cpp +++ b/test/test_ibeta_derivative.cpp @@ -4,7 +4,7 @@ // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #if defined(__GNUC__) && __GNUC__ <= 12 #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wliteral-range" +#pragma GCC diagnostic ignored "-Woverflow" #endif #include #include "test_ibeta_derivative.hpp" diff --git a/test/test_ibeta_derivative_double.cu b/test/test_ibeta_derivative_double.cu new file mode 100644 index 0000000000..e5f7f340ba --- /dev/null +++ b/test/test_ibeta_derivative_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the table data
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_derivative_float.cu b/test/test_ibeta_derivative_float.cu new file mode 100644 index 0000000000..36a79665d4 --- /dev/null +++ b/test/test_ibeta_derivative_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << 
w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_derivative_nvrtc_double.cpp b/test/test_ibeta_derivative_nvrtc_double.cpp new file mode 100644 index 0000000000..f15d21db00 --- /dev/null +++ b/test/test_ibeta_derivative_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + 
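// NVRTC flow: compile the kernel source string to PTX at run time, load the PTX as a module, then launch it through the driver API.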
nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function");
+
+        int numElements = static_cast<int>(ibeta_data.size() + ibeta_small_data.size());
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays from the tabulated test data
+        for (unsigned i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (unsigned i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i =
0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_derivative_nvrtc_float.cpp b/test/test_ibeta_derivative_nvrtc_float.cpp new file mode 100644 index 0000000000..17443e0bdc --- /dev/null +++ b/test/test_ibeta_derivative_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << 
std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_double.cu b/test/test_ibeta_double.cu new file mode 100644 index 0000000000..20384bf25f --- /dev/null +++ b/test/test_ibeta_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + 
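// Kernel launches are asynchronous: any launch failure is retrieved here via cudaGetLastError() once the device has synchronized.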
err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_float.cu b/test/test_ibeta_float.cu new file mode 100644 index 0000000000..be17813ee4 --- /dev/null +++ b/test/test_ibeta_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inv.cpp b/test/test_ibeta_inv.cpp index 218c1625e8..ab1f4267fc 100644 --- a/test/test_ibeta_inv.cpp +++ b/test/test_ibeta_inv.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include"test_ibeta_inv.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta_inv.hpp b/test/test_ibeta_inv.hpp index ba98901773..fa765b2ef8 100644 --- a/test/test_ibeta_inv.hpp +++ b/test/test_ibeta_inv.hpp @@ -8,10 +8,11 @@ #define BOOST_TEST_MAIN #include #include +#include #include // for has_denorm_now #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include @@ -306,6 +307,7 @@ void test_spots(T) BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast(2.125), -n, static_cast(0.125)), std::domain_error); BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast(2.125), static_cast(1.125), -n), std::domain_error); } + #ifndef SYCL_LANGUAGE_VERSION if (boost::math::detail::has_denorm_now()) { T m = std::numeric_limits::denorm_min(); @@ -317,5 +319,6 @@ void test_spots(T) BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(static_cast(12.125), m, static_cast(0.125)))); BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(m, m, static_cast(0.125)))); } + #endif } diff --git a/test/test_ibeta_inv_ab.cpp b/test/test_ibeta_inv_ab.cpp index c1acb2d1ca..fdf735ef1e 100644 --- a/test/test_ibeta_inv_ab.cpp +++ b/test/test_ibeta_inv_ab.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include "test_ibeta_inv_ab.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta_inv_ab.hpp b/test/test_ibeta_inv_ab.hpp index c378d15287..b91ab5261d 100644 --- a/test/test_ibeta_inv_ab.hpp +++ b/test/test_ibeta_inv_ab.hpp @@ -10,9 +10,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include diff --git a/test/test_ibeta_inv_double.cu b/test/test_ibeta_inv_double.cu new file mode 100644 index 0000000000..ef62c5e162 --- /dev/null +++ b/test/test_ibeta_inv_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the table data
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inv_float.cu b/test/test_ibeta_inv_float.cu new file mode 100644 index 0000000000..a0d48bfbda --- /dev/null +++ b/test/test_ibeta_inv_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + 
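// Fetch any deferred launch error before validating the results.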
err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inv_nvrtc_double.cpp b/test/test_ibeta_inv_nvrtc_double.cpp new file mode 100644 index 0000000000..2f01012bbe --- /dev/null +++ b/test/test_ibeta_inv_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, 
cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], 
h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inv_nvrtc_float.cpp b/test/test_ibeta_inv_nvrtc_float.cpp new file mode 100644 index 0000000000..5d804398cb --- /dev/null +++ b/test/test_ibeta_inv_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", 
"--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + 
+ nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inva_double.cu b/test/test_ibeta_inva_double.cu new file mode 100644 index 0000000000..7783eb21bb --- /dev/null +++ b/test/test_ibeta_inva_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inva_float.cu b/test/test_ibeta_inva_float.cu new file mode 100644 index 0000000000..ff918f9436 --- /dev/null +++ b/test/test_ibeta_inva_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % 
v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inva_nvrtc_double.cpp b/test/test_ibeta_inva_nvrtc_double.cpp new file mode 100644 index 0000000000..a392eaea65 --- /dev/null +++ b/test/test_ibeta_inva_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, 
*d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inva_nvrtc_float.cpp b/test/test_ibeta_inva_nvrtc_float.cpp new file mode 100644 index 0000000000..ba5745c321 --- /dev/null +++ b/test/test_ibeta_inva_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, 
*d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_invb_double.cu b/test/test_ibeta_invb_double.cu new file mode 100644 index 0000000000..562f5349dd --- /dev/null +++ b/test/test_ibeta_invb_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_invb_float.cu b/test/test_ibeta_invb_float.cu new file mode 100644 index 0000000000..86f5615c36 --- /dev/null +++ b/test/test_ibeta_invb_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; 
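+ // The elapsed time above covers the kernel launch plus the cudaDeviceSynchronize() wait, + // so it is directly comparable with the serial CPU timing taken below.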
+ + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_invb_nvrtc_double.cpp b/test/test_ibeta_invb_nvrtc_double.cpp new file mode 100644 index 0000000000..6f046f09f3 --- /dev/null +++ b/test/test_ibeta_invb_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, 
device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // 
Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_invb_nvrtc_float.cpp b/test/test_ibeta_invb_nvrtc_float.cpp new file mode 100644 index 0000000000..f2d17b8447 --- /dev/null +++ b/test/test_ibeta_invb_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, 
nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can 
continue to process through our results array + double res; + try + { + res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_nvrtc_double.cpp b/test/test_ibeta_nvrtc_double.cpp new file mode 100644 index 0000000000..bc920b6368 --- /dev/null +++ b/test/test_ibeta_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + 
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibeta_nvrtc_float.cpp b/test/test_ibeta_nvrtc_float.cpp
new file mode 100644
index 0000000000..ee15748628
--- /dev/null
+++ b/test/test_ibeta_nvrtc_float.cpp
@@ -0,0 +1,207 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibeta_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
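+            // Surface the full NVRTC log: header and include-path problems
+            // are only diagnosable from here.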
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inv_double.cu b/test/test_ibetac_inv_double.cu
new file mode 100644
index 0000000000..a983d16677
--- /dev/null
+++ b/test/test_ibetac_inv_double.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
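+
+    // With ignore_error in force, failed evaluations come back as NaN or an
+    // infinity rather than throwing, so only finite GPU results are compared.
+    // The 300 eps bound is deliberately loose: device and host code paths
+    // need not agree to the last bit.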
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inv_float.cu b/test/test_ibetac_inv_float.cu
new file mode 100644
index 0000000000..94583b45e2
--- /dev/null
+++ b/test/test_ibetac_inv_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
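+        // Cycle through the reference table so all 50000 elements are
+        // populated and every tabulated row is exercised many times.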
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inv_nvrtc_double.cpp b/test/test_ibetac_inv_nvrtc_double.cpp
new file mode 100644
index 0000000000..a99d53b3cd
--- /dev/null
+++ b/test/test_ibetac_inv_nvrtc_double.cpp
@@ -0,0 +1,207 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inv_nvrtc_float.cpp b/test/test_ibetac_inv_nvrtc_float.cpp
new file mode 100644
index 0000000000..47e89db4c1
--- /dev/null
+++ b/test/test_ibetac_inv_nvrtc_float.cpp
@@ -0,0 +1,207 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
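+// Keep the host reference computation consistent with the GPU: errors are
+// ignored rather than thrown, and there is no promotion to long double,
+// which CUDA lacks.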
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inva_double.cu b/test/test_ibetac_inva_double.cu
new file mode 100644
index 0000000000..2efbee265d
--- /dev/null
+++ b/test/test_ibetac_inva_double.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inva_float.cu b/test/test_ibetac_inva_float.cu
new file mode 100644
index 0000000000..9bd1a29a07
--- /dev/null
+++ b/test/test_ibetac_inva_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inva_nvrtc_double.cpp b/test/test_ibetac_inva_nvrtc_double.cpp
new file mode 100644
index 0000000000..7c7bf992b3
--- /dev/null
+++ b/test/test_ibetac_inva_nvrtc_double.cpp
@@ -0,0 +1,220 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+typedef double float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
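+        // A current driver-API context is required before the PTX module
+        // can be loaded further down.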
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            // The ignore_error policy is not always honoured here, so the call below can still throw.
+            // Rather than terminating we continue to process through our results array.
+            double res;
+            try
+            {
+                res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
+            }
+            catch (...)
+            {
+                continue;
+            }
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inva_nvrtc_float.cpp b/test/test_ibetac_inva_nvrtc_float.cpp
new file mode 100644
index 0000000000..c79b8b02f1
--- /dev/null
+++ b/test/test_ibetac_inva_nvrtc_float.cpp
@@ -0,0 +1,220 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
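+        // NVRTC compiles the embedded source at run time: the kernel string
+        // restates the policy #defines and the float_type typedef because
+        // nothing from this translation unit is visible to it.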
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            // The ignore_error policy is not always honoured here, so the call below can still throw.
+            // Rather than terminating we continue to process through our results array.
+            double res;
+            try
+            {
+                res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
+            }
+            catch (...)
+            {
+                continue;
+            }
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_invb_double.cu b/test/test_ibetac_invb_double.cu
new file mode 100644
index 0000000000..fddd655af2
--- /dev/null
+++ b/test/test_ibetac_invb_double.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
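+    // Grid size uses ceiling division: (50000 + 255) / 256 = 196 blocks,
+    // with the kernel's bounds check covering the final partial block.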
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
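For reference, the launch above sizes the grid to cover numElements exactly once via ceiling division. A grid-stride loop is the usual alternative when the grid size should be decoupled from the problem size; the sketch below is illustrative only and not part of this patch (cuda_test_strided is a hypothetical name, reusing the float_type and ibetac_invb call from the file above):

// Grid-stride sketch: each thread walks the array with a stride equal to the
// total number of threads in the grid, so any grid size covers any numElements.
__global__ void cuda_test_strided(const float_type *in1, const float_type *in2,
                                  const float_type *in3, float_type *out, int numElements)
{
    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
         i < numElements;
         i += blockDim.x * gridDim.x)
    {
        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
    }
}

With this form a fixed launch such as cuda_test_strided<<<128, 256>>>(...) would still process all 50000 elements.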
diff --git a/test/test_ibetac_invb_float.cu b/test/test_ibetac_invb_float.cu
new file mode 100644
index 0000000000..fddd655af2
--- /dev/null
+++ b/test/test_ibetac_invb_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <array>
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try {
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Set the number of elements to test
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the test tables
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
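The NVRTC drivers that follow manage eight raw buffers by hand, so every early exit has to remember the matching cudaFree and delete[] calls. A RAII handle is one way to make that robust; the sketch below is illustrative only and not part of this patch (cuda_buffer is a hypothetical name, and it assumes aborting on allocation failure, as the drivers' checkCUDAError already does):

#include <cstddef>

// Sketch of an owning device-buffer handle: cudaFree runs on every exit path,
// including the exception paths that currently leak d_in1..d_out.
struct cuda_buffer
{
    float_type* ptr = nullptr;
    explicit cuda_buffer(std::size_t n)
    {
        checkCUDAError(cudaMalloc(&ptr, n * sizeof(float_type)), "Failed to allocate device memory");
    }
    ~cuda_buffer() { cudaFree(ptr); }
    cuda_buffer(const cuda_buffer&) = delete;
    cuda_buffer& operator=(const cuda_buffer&) = delete;
};

The h_in1..h_out host arrays could likewise become std::vector<float_type>, removing the delete[] calls at the end of main.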
diff --git a/test/test_ibetac_invb_nvrtc_double.cpp b/test/test_ibetac_invb_nvrtc_double.cpp
new file mode 100644
index 0000000000..76f6318901
--- /dev/null
+++ b/test/test_ibetac_invb_nvrtc_double.cpp
@@ -0,0 +1,220 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <array>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iomanip>
+#include <iostream>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+typedef double float_type;
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) /
sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) 
+ { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_invb_nvrtc_float.cpp b/test/test_ibetac_invb_nvrtc_float.cpp new file mode 100644 index 0000000000..48d0a31eec --- /dev/null +++ b/test/test_ibetac_invb_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) 
+ { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_nvrtc_double.cpp b/test/test_ibetac_nvrtc_double.cpp new file mode 100644 index 0000000000..6a59473e18 --- /dev/null +++ b/test/test_ibetac_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", 
"--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + 
nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_nvrtc_float.cpp b/test/test_ibetac_nvrtc_float.cpp new file mode 100644 index 0000000000..a989191e51 --- /dev/null +++ b/test/test_ibetac_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + 
size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_igamma.cpp b/test/test_igamma.cpp index 0ad7019963..6e034f3c60 100644 --- a/test/test_igamma.cpp +++ b/test/test_igamma.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_igamma.hpp" diff --git a/test/test_igamma_inv.cpp b/test/test_igamma_inv.cpp index 17e0bfb54f..80a553427c 100644 --- a/test/test_igamma_inv.cpp +++ b/test/test_igamma_inv.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error #include "test_igamma_inv.hpp" diff --git a/test/test_igamma_inva.cpp b/test/test_igamma_inva.cpp index 8d0e965962..443ad7bbc6 100644 --- a/test/test_igamma_inva.cpp +++ b/test/test_igamma_inva.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_igamma_inva.hpp" diff --git a/test/test_landau.cpp b/test/test_landau.cpp index 1625b21777..c69c208177 100644 --- a/test/test_landau.cpp +++ b/test/test_landau.cpp @@ -20,7 +20,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif using boost::math::landau_distribution; diff --git a/test/test_mapairy.cpp b/test/test_mapairy.cpp index ee8e43bf00..ca3b415d76 100644 --- a/test/test_mapairy.cpp +++ b/test/test_mapairy.cpp @@ -9,7 +9,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_TEST_MAIN