From 952cb774d2748a33aef4676cb70fc1933404be0c Mon Sep 17 00:00:00 2001 From: kchristin Date: Sun, 17 Nov 2024 16:16:29 +0200 Subject: [PATCH] Add CUDA builtins in different file and keep BlackScholes.cuh same as original --- .../CUDA/BlackScholes/BlackScholes_kernel.cuh | 16 +++------- .../clad/Differentiator/BuiltinDerivatives.h | 23 ------------- .../Differentiator/BuiltinDerivativesCUDA.cuh | 32 +++++++++++++++++++ include/clad/Differentiator/Differentiator.h | 3 ++ 4 files changed, 40 insertions(+), 34 deletions(-) create mode 100644 include/clad/Differentiator/BuiltinDerivativesCUDA.cuh diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh index 1db634b4c..aed2ec643 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -25,12 +25,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* - * DISCLAIMER: The following file has been slightly modified to ensure - * compatibility with Clad. The original file is available in NVIDIA's - * cuda-samples repository on GitHub. 
- */ - //////////////////////////////////////////////////////////////////////////////// // Polynomial approximation of cumulative normal distribution function //////////////////////////////////////////////////////////////////////////////// @@ -42,9 +36,9 @@ __device__ inline float cndGPU(float d) { const float A5 = 1.330274429f; const float RSQRT2PI = 0.39894228040143267793994605993438f; - float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); + float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); - float cnd = RSQRT2PI * expf(-0.5f * d * d) * + float cnd = RSQRT2PI * __expf(-0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); if (d > 0) @@ -66,15 +60,15 @@ __device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult, float sqrtT, expRT; float d1, d2, CNDD1, CNDD2; - sqrtT = sqrtf(T); - d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); + sqrtT = __fdividef(1.0F, rsqrtf(T)); + d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); d2 = d1 - V * sqrtT; CNDD1 = cndGPU(d1); CNDD2 = cndGPU(d2); // Calculate Call and Put simultaneously - expRT = expf(-R * T); + expRT = __expf(-R * T); CallResult = S * CNDD1 - X * expRT * CNDD2; PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); } diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index b2f6ba3e1..cea607692 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -386,37 +386,14 @@ inline void free_pushforward(void* ptr, void* d_ptr) { // NOLINTEND(cppcoreguidelines-owning-memory) // NOLINTEND(cppcoreguidelines-no-malloc) -CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) { - *d_a += expf(a) * d_y; -} - CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) { *d_a += (a >= 0) ? 
d_y : -d_y; } -CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) { - *d_a += (1.F / a) * d_y; -} - -CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y, - float* d_a, float* d_b) { - *d_a += (1.F / b) * d_y; - *d_b += (-a / (b * b)) * d_y; -} - CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) { *d_a += (1.F / (2.F * sqrtf(a))) * d_y; } - -#ifdef __CUDACC__ -CUDA_HOST_DEVICE inline void make_float2_pullback(float a, float b, float2 d_y, - float* d_a, float* d_b) { - *d_a += d_y.x; - *d_b += d_y.y; -} -#endif - // These are required because C variants of mathematical functions are // defined in global namespace. using std::abs_pushforward; diff --git a/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh new file mode 100644 index 000000000..9179a7856 --- /dev/null +++ b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh @@ -0,0 +1,32 @@ +#include "clad/Differentiator/CladConfig.h" + +namespace clad { + +namespace custom_derivatives { + +__device__ inline void __expf_pullback(float a, float d_y, float* d_a) { + *d_a += expf(a) * d_y; +} + +__device__ inline void __logf_pullback(float a, float d_y, float* d_a) { + *d_a += (1.F / a) * d_y; +} + +__device__ inline void __fdividef_pullback(float a, float b, float d_y, + float* d_a, float* d_b) { + *d_a += (1.F / b) * d_y; + *d_b += (-a / (b * b)) * d_y; +} + +__device__ inline void rsqrtf_pullback(float a, float d_y, float* d_a) { + // d/da rsqrt(a) = -0.5 * a^(-1.5); accumulate into *d_a, don't overwrite. + *d_a += d_y * (-0.5F * powf(a, -1.5F)); +} + +__device__ inline void make_float2_pullback(float a, float b, float2 d_y, + float* d_a, float* d_b) { + *d_a += d_y.x; + *d_b += d_y.y; +} +} // namespace custom_derivatives +} // namespace clad diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h index a4d450aff..50d6cc34e 100644 --- a/include/clad/Differentiator/Differentiator.h +++ 
b/include/clad/Differentiator/Differentiator.h @@ -10,6 +10,9 @@ #include "Array.h" #include "ArrayRef.h" #include "BuiltinDerivatives.h" +#ifdef __CUDACC__ +#include "BuiltinDerivativesCUDA.cuh" +#endif #include "CladConfig.h" #include "DynamicGraph.h" #include "FunctionTraits.h"