diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu
index 0f4ecd44c..73dfe4f2b 100644
--- a/demos/CUDA/BlackScholes/BlackScholes.cu
+++ b/demos/CUDA/BlackScholes/BlackScholes.cu
@@ -31,6 +31,8 @@
  * See supplied whitepaper for more explanations.
  */
 
+#include "clad/Differentiator/Differentiator.h"
+
 #include <helper_functions.h>  // helper functions for string parsing
 #include <helper_cuda.h>  // helper functions CUDA error checking and initialization
 
@@ -71,59 +73,66 @@ const float VOLATILITY = 0.30f;
 ////////////////////////////////////////////////////////////////////////////////
 // Main program
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  // Start logs
-  printf("[%s] - Starting...\n", argv[0]);
 
-  //'h_' prefix - CPU (host) memory space
-  float
-      // Results calculated by CPU for reference
-      *h_CallResultCPU,
-      *h_PutResultCPU,
-      // CPU copy of GPU results
-      *h_CallResultGPU, *h_PutResultGPU,
-      // CPU instance of input data
-      *h_StockPrice, *h_OptionStrike, *h_OptionYears;
+void launch(float* h_CallResultCPU, float* h_CallResultGPU,
+            float* h_PutResultCPU, float* h_PutResultGPU, float* h_StockPrice,
+            float* h_OptionStrike, float* h_OptionYears) {
 
   //'d_' prefix - GPU (device) memory space
   float
       // Results calculated by GPU
-      *d_CallResult,
-      *d_PutResult,
+      *d_CallResult = nullptr,
+      *d_PutResult = nullptr,
       // GPU instance of input data
-      *d_StockPrice, *d_OptionStrike, *d_OptionYears;
+          *d_StockPrice = nullptr, *d_OptionStrike = nullptr,
+      *d_OptionYears = nullptr;
+
+  cudaMalloc((void**)&d_CallResult, OPT_SZ);
+  cudaMalloc((void**)&d_PutResult, OPT_SZ);
+  cudaMalloc((void**)&d_StockPrice, OPT_SZ);
+  cudaMalloc((void**)&d_OptionStrike, OPT_SZ);
+  cudaMalloc((void**)&d_OptionYears, OPT_SZ);
 
-  double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;
+  // Copy options data to GPU memory for further processing
+  cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice);
+
+  BlackScholesGPU<<<DIV_UP((OPT_N / 2), 128), 128 /*480, 128*/>>>(
+      (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice,
+      (float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY,
+      OPT_N);
 
-  StopWatchInterface *hTimer = NULL;
-  int i;
+  // Both call and put is calculated
 
-  findCudaDevice(argc, (const char **)argv);
+  // Read back GPU results to compare them to CPU results
+  cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost);
 
-  sdkCreateTimer(&hTimer);
+  // Calculate options values on CPU
+  BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
+                  h_OptionYears, RISKFREE, VOLATILITY, OPT_N);
 
-  printf("Initializing data...\n");
-  printf("...allocating CPU memory for options.\n");
-  h_CallResultCPU = (float *)malloc(OPT_SZ);
-  h_PutResultCPU = (float *)malloc(OPT_SZ);
-  h_CallResultGPU = (float *)malloc(OPT_SZ);
-  h_PutResultGPU = (float *)malloc(OPT_SZ);
-  h_StockPrice = (float *)malloc(OPT_SZ);
-  h_OptionStrike = (float *)malloc(OPT_SZ);
-  h_OptionYears = (float *)malloc(OPT_SZ);
+  cudaFree(d_OptionYears);
+  cudaFree(d_OptionStrike);
+  cudaFree(d_StockPrice);
+  cudaFree(d_PutResult);
+  cudaFree(d_CallResult);
+}
 
-  printf("...allocating GPU memory for options.\n");
-  checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ));
-  checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ));
-  checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ));
-  checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ));
-  checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ));
+int main(int argc, char** argv) {
+  float* h_CallResultCPU = (float*)malloc(OPT_SZ);
+  float* h_PutResultCPU = (float*)malloc(OPT_SZ);
+  float* h_CallResultGPU = (float*)malloc(OPT_SZ);
+  float* h_PutResultGPU = (float*)malloc(OPT_SZ);
+  float* h_StockPrice = (float*)malloc(OPT_SZ);
+  float* h_OptionStrike = (float*)malloc(OPT_SZ);
+  float* h_OptionYears = (float*)malloc(OPT_SZ);
 
-  printf("...generating input data in CPU mem.\n");
   srand(5347);
 
   // Generate options set
-  for (i = 0; i < OPT_N; i++) {
+  for (int i = 0; i < OPT_N; i++) {
     h_CallResultCPU[i] = 0.0f;
     h_PutResultCPU[i] = -1.0f;
     h_StockPrice[i] = RandFloat(5.0f, 30.0f);
@@ -131,93 +140,31 @@ int main(int argc, char **argv) {
     h_OptionYears[i] = RandFloat(0.25f, 10.0f);
   }
 
-  printf("...copying input data to GPU mem.\n");
-  // Copy options data to GPU memory for further processing
-  checkCudaErrors(
-      cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice));
-  checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ,
-                             cudaMemcpyHostToDevice));
-  checkCudaErrors(
-      cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice));
-  printf("Data init done.\n\n");
-
-  printf("Executing Black-Scholes GPU kernel (%i iterations)...\n",
-         NUM_ITERATIONS);
-  checkCudaErrors(cudaDeviceSynchronize());
-  sdkResetTimer(&hTimer);
-  sdkStartTimer(&hTimer);
-
-  for (i = 0; i < NUM_ITERATIONS; i++) {
-    BlackScholesGPU<<<DIV_UP((OPT_N / 2), 128), 128 /*480, 128*/>>>(
-        (float2 *)d_CallResult, (float2 *)d_PutResult, (float2 *)d_StockPrice,
-        (float2 *)d_OptionStrike, (float2 *)d_OptionYears, RISKFREE, VOLATILITY,
-        OPT_N);
-    getLastCudaError("BlackScholesGPU() execution failed\n");
-  }
-
-  checkCudaErrors(cudaDeviceSynchronize());
-  sdkStopTimer(&hTimer);
-  gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
-
-  // Both call and put is calculated
-  printf("Options count             : %i     \n", 2 * OPT_N);
-  printf("BlackScholesGPU() time    : %f msec\n", gpuTime);
-  printf("Effective memory bandwidth: %f GB/s\n",
-         ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
-  printf("Gigaoptions per second    : %f     \n\n",
-         ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
-
-  printf(
-      "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u "
-      "options, NumDevsUsed = %u, Workgroup = %u\n",
-      (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3,
-      (2 * OPT_N), 1, 128);
-
-  printf("\nReading back GPU results...\n");
-  // Read back GPU results to compare them to CPU results
-  checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ,
-                             cudaMemcpyDeviceToHost));
-  checkCudaErrors(
-      cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost));
+  auto callGrad = clad::gradient(
+      launch, "h_CallResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears");
+  auto putGrad = clad::gradient(
+      launch, "h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears");
 
-  printf("Checking the results...\n");
-  printf("...running CPU calculations.\n\n");
-  // Calculate options values on CPU
-  BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
-                  h_OptionYears, RISKFREE, VOLATILITY, OPT_N);
-
-  printf("Comparing the results...\n");
   // Calculate max absolute difference and L1 distance
   // between CPU and GPU results
+  float delta, ref, sum_delta, sum_ref, max_delta, L1norm;
   sum_delta = 0;
   sum_ref = 0;
   max_delta = 0;
 
-  for (i = 0; i < OPT_N; i++) {
+  for (int i = 0; i < OPT_N; i++) {
     ref = h_CallResultCPU[i];
-    delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);
+    delta = fabsf(h_CallResultCPU[i] - h_CallResultGPU[i]);
 
-    if (delta > max_delta) {
+    if (delta > max_delta)
       max_delta = delta;
-    }
 
     sum_delta += delta;
-    sum_ref += fabs(ref);
+    sum_ref += fabsf(ref);
   }
 
   L1norm = sum_delta / sum_ref;
-  printf("L1 norm: %E\n", L1norm);
-  printf("Max absolute error: %E\n\n", max_delta);
-
-  printf("Shutting down...\n");
-  printf("...releasing GPU memory.\n");
-  checkCudaErrors(cudaFree(d_OptionYears));
-  checkCudaErrors(cudaFree(d_OptionStrike));
-  checkCudaErrors(cudaFree(d_StockPrice));
-  checkCudaErrors(cudaFree(d_PutResult));
-  checkCudaErrors(cudaFree(d_CallResult));
-
-  printf("...releasing CPU memory.\n");
+
   free(h_OptionYears);
   free(h_OptionStrike);
   free(h_StockPrice);
@@ -225,19 +172,11 @@ int main(int argc, char **argv) {
   free(h_CallResultGPU);
   free(h_PutResultCPU);
   free(h_CallResultCPU);
-  sdkDeleteTimer(&hTimer);
-  printf("Shutdown done.\n");
-
-  printf("\n[BlackScholes] - Test Summary\n");
 
   if (L1norm > 1e-6) {
-    printf("Test failed!\n");
-    exit(EXIT_FAILURE);
+    printf("Original test failed\n");
+    return EXIT_FAILURE;
   }
 
-  printf(
-      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
-      "Results may vary when GPU Boost is enabled.\n\n");
-  printf("Test passed\n");
-  exit(EXIT_SUCCESS);
+  return EXIT_SUCCESS;
 }
diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
index 9c49d49e4..dfcf5c575 100644
--- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
+++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
@@ -36,9 +36,9 @@ __device__ inline float cndGPU(float d) {
   const float A5 = 1.330274429f;
   const float RSQRT2PI = 0.39894228040143267793994605993438f;
 
-  float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
+  float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
 
-  float cnd = RSQRT2PI * __expf(-0.5f * d * d) *
+  float cnd = RSQRT2PI * expf(-0.5f * d * d) *
               (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
 
   if (d > 0) cnd = 1.0f - cnd;
@@ -59,15 +59,15 @@ __device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult,
   float sqrtT, expRT;
   float d1, d2, CNDD1, CNDD2;
 
-  sqrtT = __fdividef(1.0F, rsqrtf(T));
-  d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
+  sqrtT = fdividef(1.0F, 1.0 / sqrtf(T));
+  d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
   d2 = d1 - V * sqrtT;
 
   CNDD1 = cndGPU(d1);
   CNDD2 = cndGPU(d2);
 
   // Calculate Call and Put simultaneously
-  expRT = __expf(-R * T);
+  expRT = expf(-R * T);
   CallResult = S * CNDD1 - X * expRT * CNDD2;
   PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
 }
@@ -75,13 +75,12 @@ __device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult,
 ////////////////////////////////////////////////////////////////////////////////
 // Process an array of optN options on GPU
 ////////////////////////////////////////////////////////////////////////////////
-__launch_bounds__(128) __global__
-    void BlackScholesGPU(float2 *__restrict d_CallResult,
-                         float2 *__restrict d_PutResult,
-                         float2 *__restrict d_StockPrice,
-                         float2 *__restrict d_OptionStrike,
-                         float2 *__restrict d_OptionYears, float Riskfree,
-                         float Volatility, int optN) {
+__global__ void BlackScholesGPU(float2* __restrict d_CallResult,
+                                float2* __restrict d_PutResult,
+                                float2* __restrict d_StockPrice,
+                                float2* __restrict d_OptionStrike,
+                                float2* __restrict d_OptionYears,
+                                float Riskfree, float Volatility, int optN) {
   ////Thread index
   // const int      tid = blockDim.x * blockIdx.x + threadIdx.x;
   ////Total number of threads in execution grid
@@ -100,7 +99,9 @@ __launch_bounds__(128) __global__
     BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y,
                         d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree,
                         Volatility);
-    d_CallResult[opt] = make_float2(callResult1, callResult2);
-    d_PutResult[opt] = make_float2(putResult1, putResult2);
+    d_CallResult[opt].x = callResult1;
+    d_CallResult[opt].y = callResult2;
+    d_PutResult[opt].x = putResult1;
+    d_PutResult[opt].y = putResult2;
   }
 }
diff --git a/demos/CUDA/TensorContraction.cu b/demos/CUDA/TensorContraction.cu
index a1adc48bc..8e1efaf9b 100644
--- a/demos/CUDA/TensorContraction.cu
+++ b/demos/CUDA/TensorContraction.cu
@@ -13,7 +13,6 @@
 //          -L/path/to/cuda/lib64 -lcudart_static -ldl -lrt -pthread -lm -lstdc++
 // RUN: ./TensorContraction
 
-#include "cuda_runtime_api.h"
 #include "clad/Differentiator/Differentiator.h"
 
 typedef unsigned long long int size_type;
diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 62296ab92..8b8b4e09a 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -386,6 +386,28 @@ inline void free_pushforward(void* ptr, void* d_ptr) {
 // NOLINTEND(cppcoreguidelines-owning-memory)
 // NOLINTEND(cppcoreguidelines-no-malloc)
 
+CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) {
+  *d_a += expf(a) * d_y;
+}
+
+CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) {
+  *d_a += (a >= 0) ? d_y : -d_y;
+}
+
+CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) {
+  *d_a += (1.F / a) * d_y;
+}
+
+CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y,
+                                               float* d_a, float* d_b) {
+  *d_a += (1.F / b) * d_y;
+  *d_b += (-a / (b * b)) * d_y;
+}
+
+CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) {
+  *d_a += (1.F / (2.F * sqrtf(a))) * d_y;
+}
+
 // These are required because C variants of mathematical functions are
 // defined in global namespace.
 using std::abs_pushforward;