diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
index 1db634b4c..aed2ec643 100644
--- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
+++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
@@ -25,12 +25,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/*
- * DISCLAIMER: The following file has been slightly modified to ensure
- * compatibility with Clad. The original file is available in NVIDIA's
- * cuda-samples repository on GitHub.
- */
-
 ////////////////////////////////////////////////////////////////////////////////
 // Polynomial approximation of cumulative normal distribution function
 ////////////////////////////////////////////////////////////////////////////////
@@ -42,9 +36,9 @@ __device__ inline float cndGPU(float d) {
   const float A5 = 1.330274429f;
   const float RSQRT2PI = 0.39894228040143267793994605993438f;
 
-  float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
+  float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
 
-  float cnd = RSQRT2PI * expf(-0.5f * d * d) *
+  float cnd = RSQRT2PI * __expf(-0.5f * d * d) *
               (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
 
   if (d > 0)
@@ -66,15 +60,15 @@ __device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult,
   float sqrtT, expRT;
   float d1, d2, CNDD1, CNDD2;
 
-  sqrtT = sqrtf(T);
-  d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
+  sqrtT = __fdividef(1.0F, rsqrtf(T));
+  d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
   d2 = d1 - V * sqrtT;
 
   CNDD1 = cndGPU(d1);
   CNDD2 = cndGPU(d2);
 
   // Calculate Call and Put simultaneously
-  expRT = expf(-R * T);
+  expRT = __expf(-R * T);
   CallResult = S * CNDD1 - X * expRT * CNDD2;
   PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
 }
diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index b2f6ba3e1..cea607692 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -386,37 +386,14 @@ inline void free_pushforward(void* ptr, void* d_ptr) {
 // NOLINTEND(cppcoreguidelines-owning-memory)
 // NOLINTEND(cppcoreguidelines-no-malloc)
 
-CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) {
-  *d_a += expf(a) * d_y;
-}
-
 CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) {
   *d_a += (a >= 0) ? d_y : -d_y;
 }
 
-CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) {
-  *d_a += (1.F / a) * d_y;
-}
-
-CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y,
-                                               float* d_a, float* d_b) {
-  *d_a += (1.F / b) * d_y;
-  *d_b += (-a / (b * b)) * d_y;
-}
-
 CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) {
   *d_a += (1.F / (2.F * sqrtf(a))) * d_y;
 }
 
-
-#ifdef __CUDACC__
-CUDA_HOST_DEVICE inline void make_float2_pullback(float a, float b, float2 d_y,
-                                                  float* d_a, float* d_b) {
-  *d_a += d_y.x;
-  *d_b += d_y.y;
-}
-#endif
-
 // These are required because C variants of mathematical functions are
 // defined in global namespace.
 using std::abs_pushforward;
diff --git a/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh
new file mode 100644
index 000000000..9179a7856
--- /dev/null
+++ b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh
@@ -0,0 +1,51 @@
+#ifndef CLAD_DIFFERENTIATOR_BUILTINDERIVATIVESCUDA_H
+#define CLAD_DIFFERENTIATOR_BUILTINDERIVATIVESCUDA_H
+
+#include "clad/Differentiator/CladConfig.h"
+
+namespace clad {
+
+namespace custom_derivatives {
+
+// Reverse-mode (pullback) derivatives of CUDA device math functions.
+// Each pullback takes the original call's arguments, the adjoint d_y of the
+// call's result, and one output pointer per argument; it accumulates (+=) the
+// argument's share of d_y into that pointer.
+
+/// Pullback of __expf(a): d/da exp(a) = exp(a).
+__device__ inline void __expf_pullback(float a, float d_y, float* d_a) {
+  *d_a += expf(a) * d_y;
+}
+
+/// Pullback of __logf(a): d/da log(a) = 1 / a.
+__device__ inline void __logf_pullback(float a, float d_y, float* d_a) {
+  *d_a += (1.F / a) * d_y;
+}
+
+/// Pullback of __fdividef(a, b) = a / b: d/da = 1 / b, d/db = -a / b^2.
+__device__ inline void __fdividef_pullback(float a, float b, float d_y,
+                                           float* d_a, float* d_b) {
+  *d_a += (1.F / b) * d_y;
+  *d_b += (-a / (b * b)) * d_y;
+}
+
+/// Pullback of rsqrtf(a) = a^(-1/2): d/da = -0.5 * a^(-3/2).
+__device__ inline void rsqrtf_pullback(float a, float d_y, float* d_a) {
+  // Accumulate (+=) like the other pullbacks: plain assignment would discard
+  // adjoint contributions already stored in *d_a. The F suffixes keep the
+  // arithmetic in single precision instead of promoting to double.
+  *d_a += d_y * (-0.5F * powf(a, -1.5F));
+}
+
+/// Pullback of make_float2(a, b): the x component of d_y belongs to a and the
+/// y component to b.
+__device__ inline void make_float2_pullback(float a, float b, float2 d_y,
+                                            float* d_a, float* d_b) {
+  *d_a += d_y.x;
+  *d_b += d_y.y;
+}
+
+} // namespace custom_derivatives
+} // namespace clad
+
+#endif // CLAD_DIFFERENTIATOR_BUILTINDERIVATIVESCUDA_H
diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h
index a4d450aff..50d6cc34e 100644
--- a/include/clad/Differentiator/Differentiator.h
+++ b/include/clad/Differentiator/Differentiator.h
@@ -10,6 +10,9 @@
 #include "Array.h"
 #include "ArrayRef.h"
 #include "BuiltinDerivatives.h"
+#ifdef __CUDACC__
+#include "BuiltinDerivativesCUDA.cuh"
+#endif
 #include "CladConfig.h"
 #include "DynamicGraph.h"
 #include "FunctionTraits.h"