From f895ad6f7f5e6d26efad6d83b123b37cd873cd12 Mon Sep 17 00:00:00 2001 From: kchristin Date: Sat, 2 Nov 2024 15:32:45 +0200 Subject: [PATCH 01/18] Add CUDA Black-Scholes demo --- demos/CUDA/BlackScholes/BlackScholes.cu | 405 ++++++++ demos/CUDA/BlackScholes/BlackScholes_gold.cpp | 93 ++ .../CUDA/BlackScholes/BlackScholes_kernel.cuh | 108 ++ demos/CUDA/BlackScholes/Makefile | 41 + demos/CUDA/BlackScholes/helper/helper_cuda.h | 951 +++++++++++++++++ .../BlackScholes/helper/helper_functions.h | 59 ++ demos/CUDA/BlackScholes/helper/helper_image.h | 961 ++++++++++++++++++ .../CUDA/BlackScholes/helper/helper_string.h | 441 ++++++++ demos/CUDA/BlackScholes/helper/helper_timer.h | 448 ++++++++ .../clad/Differentiator/BuiltinDerivatives.h | 22 + 10 files changed, 3529 insertions(+) create mode 100644 demos/CUDA/BlackScholes/BlackScholes.cu create mode 100644 demos/CUDA/BlackScholes/BlackScholes_gold.cpp create mode 100644 demos/CUDA/BlackScholes/BlackScholes_kernel.cuh create mode 100644 demos/CUDA/BlackScholes/Makefile create mode 100644 demos/CUDA/BlackScholes/helper/helper_cuda.h create mode 100644 demos/CUDA/BlackScholes/helper/helper_functions.h create mode 100644 demos/CUDA/BlackScholes/helper/helper_image.h create mode 100644 demos/CUDA/BlackScholes/helper/helper_string.h create mode 100644 demos/CUDA/BlackScholes/helper/helper_timer.h diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu new file mode 100644 index 000000000..3c3617b44 --- /dev/null +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -0,0 +1,405 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This sample evaluates fair call and put prices for a + * given set of European options by Black-Scholes formula. + * See supplied whitepaper for more explanations. + */ + +/* + * DISCLAIMER: The following file has been modified slightly to make it + * compatible with Clad. The original file can be found at NVIDIA's cuda-samples + * repository at GitHub. + * + * Relevant documentation regarding the problem at hand can be found at NVIDIA's + * cuda-samples repository. With the use of Clad, we compute some of the Greeks + * (sensitivities) for Black-Scholes and verify them using the + * theoretical values as denoted in Wikipedia + * (https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model). + * + * To build and run the demo, run the following command: make run + */ + +#include "clad/Differentiator/Differentiator.h" + +#include // helper functions CUDA error checking and initialization +#include // helper functions for string parsing + +//////////////////////////////////////////////////////////////////////////////// +// Process an array of optN options on CPU +//////////////////////////////////////////////////////////////////////////////// +extern "C" void BlackScholesCPU(float* h_CallResult, float* h_PutResult, + float* h_StockPrice, float* h_OptionStrike, + float* h_OptionYears, float Riskfree, + float Volatility, int optN); +extern "C" double CND(double d); + +//////////////////////////////////////////////////////////////////////////////// +// Process an array of OptN options on GPU +//////////////////////////////////////////////////////////////////////////////// +#include "BlackScholes_kernel.cuh" + +//////////////////////////////////////////////////////////////////////////////// +// Helper function, returning uniformly distributed +// random float in [low, high] range +//////////////////////////////////////////////////////////////////////////////// +float RandFloat(float low, float high) { + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; +} + +//////////////////////////////////////////////////////////////////////////////// +// Data configuration +//////////////////////////////////////////////////////////////////////////////// +const int OPT_N = 4000000; +const int NUM_ITERATIONS = 512; + +const int OPT_SZ = OPT_N * sizeof(float); +const float RISKFREE = 0.02f; +const float VOLATILITY = 0.30f; + +#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) + +//////////////////////////////////////////////////////////////////////////////// +// Main program +//////////////////////////////////////////////////////////////////////////////// + +void launch(float* h_CallResultCPU, float* h_CallResultGPU, + float* h_PutResultCPU, float* h_PutResultGPU, float* h_StockPrice, + float* h_OptionStrike, float* h_OptionYears) { + + //'d_' prefix - GPU (device) memory space + float + // Results calculated by GPU + *d_CallResult = nullptr, + *d_PutResult = nullptr, + // GPU instance of input data + *d_StockPrice = nullptr, *d_OptionStrike = nullptr, + *d_OptionYears = nullptr; + + cudaMalloc((void**)&d_CallResult, OPT_SZ); + cudaMalloc((void**)&d_PutResult, OPT_SZ); + cudaMalloc((void**)&d_StockPrice, OPT_SZ); + cudaMalloc((void**)&d_OptionStrike, OPT_SZ); + cudaMalloc((void**)&d_OptionYears, OPT_SZ); + + // Copy options data to GPU memory for further processing + cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice); + cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice); + cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice); + + BlackScholesGPU<<>>( + (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice, + (float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY, + OPT_N); + + // Both call and put is calculated + + // Read back GPU results to compare them to CPU results + cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost); + cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost); + + // Calculate options values on CPU + BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, + h_OptionYears, RISKFREE, VOLATILITY, OPT_N); + + cudaFree(d_OptionYears); + cudaFree(d_OptionStrike); + cudaFree(d_StockPrice); + cudaFree(d_PutResult); + cudaFree(d_CallResult); +} + +double d1(double S, double X, double T) { + return (log(S / X) + (RISKFREE + 0.5 * VOLATILITY * VOLATILITY) * T) / + (VOLATILITY * sqrt(T)); +} + +double N_prime(double d) { + const double RSQRT2PI = + 0.39894228040143267793994605993438; // 1 / sqrt(2 * PI) + return RSQRT2PI * exp(-0.5 * d * d); +} + +enum Greek { Delta, dX, Theta }; + +double computeL1norm_Call(float* S, float* X, float* T, float* d, Greek greek) { + double delta, ref, sum_delta, sum_ref; + sum_delta = 0; + sum_ref = 0; + switch (greek) { + case Delta: + for (int i = 0; i < OPT_N; i++) { + double d1_val = d1(S[i], X[i], T[i]); + ref = CND(d1_val); + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); + } + break; + case dX: + for (int i = 0; i < OPT_N; i++) { + double T_val = T[i]; + double d1_val = d1(S[i], X[i], T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = -expRT * CND(d2_val); + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); + } + break; + case Theta: + for (int i = 0; i < OPT_N; i++) { + double S_val = S[i], X_val = X[i], T_val = T[i]; + double d1_val = d1(S_val, X_val, T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = + (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) + + RISKFREE * X_val * expRT * + CND(d2_val); // theta is with respect to t, so -theta is the + // approximation of the derivative with respect to T + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); + } + } + + return sum_delta / sum_ref; +} + +double computeL1norm_Put(float* S, float* X, float* T, float* d, Greek greek) { + double delta, ref, sum_delta, sum_ref; + sum_delta = 0; + sum_ref = 0; + switch (greek) { + case Delta: + for (int i = 0; i < OPT_N; i++) { + double d1_val = d1(S[i], X[i], T[i]); + ref = CND(d1_val) - 1.0; + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); + } + break; + case dX: + for (int i = 0; i < OPT_N; i++) { + double T_val = T[i]; + double d1_val = d1(S[i], X[i], T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = expRT * CND(-d2_val); + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); + } + break; + case Theta: + for (int i = 0; i < OPT_N; i++) { + double S_val = S[i], X_val = X[i], T_val = T[i]; + double d1_val = d1(S_val, X_val, T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) - + RISKFREE * X_val * expRT * CND(-d2_val); + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); + } + } + + return sum_delta / sum_ref; +} + +int main(int argc, char** argv) { + float* h_CallResultCPU = (float*)malloc(OPT_SZ); + float* h_PutResultCPU = (float*)malloc(OPT_SZ); + float* h_CallResultGPU = (float*)malloc(OPT_SZ); + float* h_PutResultGPU = (float*)malloc(OPT_SZ); + float* h_StockPrice = (float*)malloc(OPT_SZ); + float* h_OptionStrike = (float*)malloc(OPT_SZ); + float* h_OptionYears = (float*)malloc(OPT_SZ); + + srand(5347); + + // Generate options set + for (int i = 0; i < OPT_N; i++) { + h_CallResultCPU[i] = 0.0f; + h_PutResultCPU[i] = -1.0f; + h_StockPrice[i] = RandFloat(5.0f, 30.0f); + h_OptionStrike[i] = RandFloat(1.0f, 100.0f); + h_OptionYears[i] = RandFloat(0.25f, 10.0f); + } + + // Compute gradients + auto callGrad = clad::gradient( + launch, "h_CallResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears"); + auto putGrad = clad::gradient( + launch, "h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears"); + + // Declare and initialize the derivatives + float* d_CallResultGPU = (float*)malloc(OPT_SZ); + float* d_PutResultGPU = (float*)malloc(OPT_SZ); + float* d_StockPrice = (float*)calloc(OPT_N, sizeof(float)); + float* d_OptionStrike = (float*)calloc(OPT_N, sizeof(float)); + float* d_OptionYears = (float*)calloc(OPT_N, sizeof(float)); + + for (int i = 0; i < OPT_N; i++) { + d_CallResultGPU[i] = 1.0f; + d_PutResultGPU[i] = 1.0f; + } + + // Launch the kernel and the gradient + + // Compute the derivatives of the price of the call options + callGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU, + h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears, + d_CallResultGPU, d_StockPrice, d_OptionStrike, + d_OptionYears); + + // Calculate max absolute difference and L1 distance + // between CPU and GPU results + double delta, ref, sum_delta, sum_ref, L1norm; + sum_delta = 0; + sum_ref = 0; + + for (int i = 0; i < OPT_N; i++) { + ref = h_CallResultCPU[i]; + delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); + sum_delta += delta; + sum_ref += fabs(ref); + } + + L1norm = sum_delta / sum_ref; + printf("L1norm = %E\n", L1norm); + if (L1norm > 1e-6) { + printf("Original test failed\n"); + return EXIT_FAILURE; + } + + // Verify delta + L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears, + d_StockPrice, Delta); + printf("L1norm of delta for Call option = %E\n", L1norm); + if (L1norm > 1e-5) { + printf("Gradient test failed: the difference between the computed and the " + "approximated theoretical delta for Call option is larger than " + "expected\n"); + return EXIT_FAILURE; + } + + // Verify derivatives with respect to the Strike price + L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionStrike, dX); + printf("L1norm of derivative of Call w.r.t. the strike price = %E\n", L1norm); + if (L1norm > 1e-5) { + printf( + "Gradient test failed: the difference between the computed and the " + "approximated theoretical derivative of Call w.r.t. the strike price " + "is larger than expected\n"); + return EXIT_FAILURE; + } + + // Verify theta + L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionYears, Theta); + printf("L1norm of theta for Call option = %E\n", L1norm); + if (L1norm > 1e-5) { + printf("Gradient test failed: the difference between the computed and the " + "approximated theoretical theta for Call option is larger than " + "expected\n"); + return EXIT_FAILURE; + } + + // Compute the derivatives of the price of the Put options + for (int i = 0; i < OPT_N; i++) { + h_CallResultCPU[i] = 0.0f; + h_PutResultCPU[i] = -1.0f; + d_CallResultGPU[i] = 1.0f; + d_PutResultGPU[i] = 1.0f; + } + + for (int i = 0; i < OPT_N; i++) { + d_StockPrice[i] = 0.f; + d_OptionStrike[i] = 0.f; + d_OptionYears[i] = 0.f; + } + + putGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU, + h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears, + d_PutResultGPU, d_StockPrice, d_OptionStrike, d_OptionYears); + + // Verify delta + L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, + d_StockPrice, Delta); + printf("L1norm of delta for Put option = %E\n", L1norm); + if (L1norm > 1e-5) { + printf("Gradient test failed: the difference between the computed and " + "the approximated theoretical delta for Put option is larger than " + "expected\n"); + return EXIT_FAILURE; + } + + // Verify derivatives with respect to the Strike price + L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionStrike, dX); + printf("L1norm of derivative of Put w.r.t. the strike price = %E\n", L1norm); + if (L1norm > 1e-6) { + printf("Gradient test failed: the difference between the computed and the " + "approximated theoretcial derivative of " + "Put w.r.t. the strike price is larger than expected\n"); + return EXIT_FAILURE; + } + + // Verify theta + L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionYears, Theta); + printf("L1norm of theta for Put option = %E\n", L1norm); + if (L1norm > 1e-5) { + printf("Gradient test failed: the difference between the computed and the " + "approximated theoretical theta for Put option is larger than " + "expected\n"); + return EXIT_FAILURE; + } + + free(h_OptionYears); + free(h_OptionStrike); + free(h_StockPrice); + free(h_PutResultGPU); + free(h_CallResultGPU); + free(h_PutResultCPU); + free(h_CallResultCPU); + free(d_OptionYears); + free(d_OptionStrike); + free(d_StockPrice); + free(d_PutResultGPU); + free(d_CallResultGPU); + + return EXIT_SUCCESS; +} diff --git a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp new file mode 100644 index 000000000..6b0d9c8ef --- /dev/null +++ b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * DISCLAIMER: The following file has been modified slightly to make it + * compatible with Clad. The original file can be found at NVIDIA's cuda-samples + * repository at GitHub. + */ + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Polynomial approximation of cumulative normal distribution function +//////////////////////////////////////////////////////////////////////////////// +extern "C" double CND(double d) { + const double A1 = 0.31938153; + const double A2 = -0.356563782; + const double A3 = 1.781477937; + const double A4 = -1.821255978; + const double A5 = 1.330274429; + const double RSQRT2PI = 0.39894228040143267793994605993438; + + double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); + + double cnd = RSQRT2PI * exp(-0.5 * d * d) * + (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + + if (d > 0) + cnd = 1.0 - cnd; + + return cnd; +} + +//////////////////////////////////////////////////////////////////////////////// +// Black-Scholes formula for both call and put +//////////////////////////////////////////////////////////////////////////////// +static void BlackScholesBodyCPU(float& callResult, float& putResult, + float Sf, // Stock price + float Xf, // Option strike + float Tf, // Option years + float Rf, // Riskless rate + float Vf // Volatility rate +) { + double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; + + double sqrtT = sqrt(T); + double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); + double d2 = d1 - V * sqrtT; + double CNDD1 = CND(d1); + double CNDD2 = CND(d2); + + // Calculate Call and Put simultaneously + double expRT = exp(-R * T); + callResult = (float)(S * CNDD1 - X * expRT * CNDD2); + putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); +} + +//////////////////////////////////////////////////////////////////////////////// +// Process an array of optN options +//////////////////////////////////////////////////////////////////////////////// +extern "C" void BlackScholesCPU(float* h_CallResult, float* h_PutResult, + float* h_StockPrice, float* h_OptionStrike, + float* h_OptionYears, float Riskfree, + float Volatility, int optN) { + for (int opt = 0; opt < optN; opt++) + BlackScholesBodyCPU(h_CallResult[opt], h_PutResult[opt], h_StockPrice[opt], + h_OptionStrike[opt], h_OptionYears[opt], Riskfree, + Volatility); +} diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh new file mode 100644 index 000000000..26497b8ac --- /dev/null +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -0,0 +1,108 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// Polynomial approximation of cumulative normal distribution function +//////////////////////////////////////////////////////////////////////////////// +__device__ inline float cndGPU(float d) { + const float A1 = 0.31938153f; + const float A2 = -0.356563782f; + const float A3 = 1.781477937f; + const float A4 = -1.821255978f; + const float A5 = 1.330274429f; + const float RSQRT2PI = 0.39894228040143267793994605993438f; + + float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); + + float cnd = RSQRT2PI * expf(-0.5f * d * d) * + (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + + if (d > 0) + cnd = 1.0f - cnd; + + return cnd; +} + +//////////////////////////////////////////////////////////////////////////////// +// Black-Scholes formula for both call and put +//////////////////////////////////////////////////////////////////////////////// +__device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult, + float S, // Stock price + float X, // Option strike + float T, // Option years + float R, // Riskless rate + float V // Volatility rate +) { + float sqrtT, expRT; + float d1, d2, CNDD1, CNDD2; + + sqrtT = fdividef(1.0F, 1.0 / sqrtf(T)); + d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); + d2 = d1 - V * sqrtT; + + CNDD1 = cndGPU(d1); + CNDD2 = cndGPU(d2); + + // Calculate Call and Put simultaneously + expRT = expf(-R * T); + CallResult = S * CNDD1 - X * expRT * CNDD2; + PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); +} + +//////////////////////////////////////////////////////////////////////////////// +// Process an array of optN options on GPU +//////////////////////////////////////////////////////////////////////////////// +__global__ void BlackScholesGPU(float2* __restrict d_CallResult, + float2* __restrict d_PutResult, + float2* __restrict d_StockPrice, + float2* __restrict d_OptionStrike, + float2* __restrict d_OptionYears, + float Riskfree, float Volatility, int optN) { + ////Thread index + // const int tid = blockDim.x * blockIdx.x + threadIdx.x; + ////Total number of threads in execution grid + // const int THREAD_N = blockDim.x * gridDim.x; + + const int opt = blockDim.x * blockIdx.x + threadIdx.x; + + // Calculating 2 options per thread to increase ILP (instruction level + // parallelism) + if (opt < (optN / 2)) { + float callResult1, callResult2; + float putResult1, putResult2; + BlackScholesBodyGPU(callResult1, putResult1, d_StockPrice[opt].x, + d_OptionStrike[opt].x, d_OptionYears[opt].x, Riskfree, + Volatility); + BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y, + d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree, + Volatility); + d_CallResult[opt].x = callResult1; + d_CallResult[opt].y = callResult2; + d_PutResult[opt].x = putResult1; + d_PutResult[opt].y = putResult2; + } +} diff --git a/demos/CUDA/BlackScholes/Makefile b/demos/CUDA/BlackScholes/Makefile new file mode 100644 index 000000000..665939732 --- /dev/null +++ b/demos/CUDA/BlackScholes/Makefile @@ -0,0 +1,41 @@ +# Paths and Compiler Settings +LLVM_PATH = /usr/lib/llvm-17 +CLANG = $(LLVM_PATH)/bin/clang +CUDA_PATH ?= /usr/local/cuda-11.8 +CLAD_PATH = $(CURDIR)/../../.. +CLAD_PLUGIN = $(CLAD_PATH)/build/./lib/clad.so + +# Compiler flags +CXXFLAGS = -std=c++17 -Xclang -add-plugin -Xclang clad +CXXFLAGS += -Xclang -load -Xclang $(CLAD_PLUGIN) +CXXFLAGS += -I$(CLAD_PATH)/include -I$(CURDIR)/helper -I$(CUDA_PATH)/include + +CUDA_FLAGS = --cuda-path=$(CUDA_PATH) --cuda-gpu-arch=sm_60 + +# Linker flags +LDFLAGS = -L$(CUDA_PATH)/lib64 -lcudart_static -ldl -lrt -pthread -lm -lstdc++ + +all: build + +.SILENT: build run clean clobber BlackScholes BlackScholes.o BlackScholes_gold.o + +build: BlackScholes + +BlackScholes.o:BlackScholes.cu + $(CLANG) $(CXXFLAGS) -o $@ -c $< $(CUDA_FLAGS) + +BlackScholes_gold.o:BlackScholes_gold.cpp + $(CLANG) $(CXXFLAGS) -o $@ -c $< + +BlackScholes: BlackScholes.o BlackScholes_gold.o + $(CLANG) $(CXXFLAGS) -o $@ BlackScholes.o BlackScholes_gold.o $(LDFLAGS) + +run: build + ./BlackScholes + +testrun: build + +clean: + rm -f BlackScholes BlackScholes.o BlackScholes_gold.o + +clobber: clean \ No newline at end of file diff --git a/demos/CUDA/BlackScholes/helper/helper_cuda.h b/demos/CUDA/BlackScholes/helper/helper_cuda.h new file mode 100644 index 000000000..6666e8208 --- /dev/null +++ b/demos/CUDA/BlackScholes/helper/helper_cuda.h @@ -0,0 +1,951 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include + +#include + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char* _cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char* _cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char* ret = NULL; + cuGetErrorName(error, &ret); + return ret ? ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char* _cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char* _cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char* _cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char* _cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char* _cudaGetErrorEnum(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char* _cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char* _cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, const char* const func, const char* const file, + const int line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#ifdef __DRIVER_TYPES_H__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char* errorMessage, const char* file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char* errorMessage, const char* file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + } +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128}, + {0x52, 128}, {0x53, 128}, {0x60, 64}, {0x61, 128}, {0x62, 128}, + {0x70, 64}, {0x72, 64}, {0x75, 64}, {0x80, 64}, {0x86, 128}, + {0x87, 128}, {0x89, 128}, {0x90, 128}, {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + return nGpuArchCoresPerSM[index].Cores; + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf("MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, {0x32, "Kepler"}, {0x35, "Kepler"}, + {0x37, "Kepler"}, {0x50, "Maxwell"}, {0x52, "Maxwell"}, + {0x53, "Maxwell"}, {0x60, "Pascal"}, {0x61, "Pascal"}, + {0x62, "Pascal"}, {0x70, "Volta"}, {0x72, "Xavier"}, + {0x75, "Turing"}, {0x80, "Ampere"}, {0x86, "Ampere"}, + {0x87, "Ampere"}, {0x89, "Ada"}, {0x90, "Hopper"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) + return nGpuArchNameSM[index].name; + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf("MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} +// end of GPU Architecture definitions + +#ifdef __CUDA_RUNTIME_H__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) + devID = 0; + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors( + cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors( + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + if (computeMode == cudaComputeModeProhibited) { + fprintf(stderr, "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, + _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, + current_device)); + checkCudaErrors(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, current_device)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (computeMode != cudaComputeModeProhibited) { + if (major == 9999 && minor == 9999) + sm_per_multiproc = 1; + else + sm_per_multiproc = _ConvertSMVer2Cores(major, minor); + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, + cudaDevAttrMultiProcessorCount, + current_device)); + cudaError_t result = cudaDeviceGetAttribute( + &clockRate, cudaDevAttrClockRate, current_device); + if (result != cudaSuccess) { + // If cudaDevAttrClockRate attribute is not supported we + // set clockRate as 1, to consider GPU with most SMs and CUDA Cores. + if (result == cudaErrorInvalidValue) { + clockRate = 1; + } else { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, + __LINE__, static_cast(result), + _cudaGetErrorEnum(result)); + exit(EXIT_FAILURE); + } + } + uint64_t compute_perf = + (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char** argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(devID)); + int major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, + _ConvertSMVer2ArchName(major, minor), major, minor); + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, + current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, + current_device)); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + if (integrated && (computeMode != cudaComputeModeProhibited)) { + checkCudaErrors(cudaSetDevice(current_device)); + + int major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, + minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(cudaGetDevice(&dev)); + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + checkCudaErrors( + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); + + if ((major > major_version) || + (major == major_version && minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf(" No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + +// end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_functions.h b/demos/CUDA/BlackScholes/helper/helper_functions.h new file mode 100644 index 000000000..bd40ba43e --- /dev/null +++ b/demos/CUDA/BlackScholes/helper/helper_functions.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, +// timers, image helpers, etc) +#ifndef COMMON_HELPER_FUNCTIONS_H_ +#define COMMON_HELPER_FUNCTIONS_H_ + +#ifdef WIN32 +#pragma warning(disable : 4996) +#endif + +// includes, project +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// includes, timer, string parsing, image helpers +#include // helper functions for image compare, dump, data comparisons +#include // helper functions for string parsing +#include // helper functions for timers + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#endif // COMMON_HELPER_FUNCTIONS_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_image.h b/demos/CUDA/BlackScholes/helper/helper_image.h new file mode 100644 index 000000000..33fbf1b62 --- /dev/null +++ b/demos/CUDA/BlackScholes/helper/helper_image.h @@ -0,0 +1,961 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (image,bitmap) +#ifndef COMMON_HELPER_IMAGE_H_ +#define COMMON_HELPER_IMAGE_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a < b) ? a : b) +#endif +#ifndef MAX +#define MAX(a, b) ((a > b) ? a : b) +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#include + +// namespace unnamed (internal) +namespace helper_image_internal { +//! size of PGM file header +const unsigned int PGMHeaderSize = 0x40; + +// types + +//! Data converter from unsigned char / unsigned byte to type T +template struct ConverterFromUByte; + +//! Data converter from unsigned char / unsigned byte +template <> struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char& val) { + return static_cast(val); + } +}; + +//! Data converter from unsigned char / unsigned byte to float +template <> struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char& val) { + return static_cast(val) / 255.0f; + } +}; + +//! Data converter from unsigned char / unsigned byte to type T +template struct ConverterToUByte; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> struct ConverterToUByte { + //! Conversion operator (essentially a passthru + //! @return converted value + //! @param val value to convert + unsigned char operator()(const unsigned char& val) { return val; } +}; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> struct ConverterToUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + unsigned char operator()(const float& val) { + return static_cast(val * 255.0f); + } +}; +} // namespace helper_image_internal + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +inline bool __loadPPM(const char* file, unsigned char** data, unsigned int* w, + unsigned int* h, unsigned int* channels) { + FILE* fp = NULL; + + if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) { + std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; + return false; + } + + // check header + char header[helper_image_internal::PGMHeaderSize]; + + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (strncmp(header, "P5", 2) == 0) { + *channels = 1; + } else if (strncmp(header, "P6", 2) == 0) { + *channels = 3; + } else { + std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; + *channels = 0; + return false; + } + + // parse header, read maxval, width and height + unsigned int width = 0; + unsigned int height = 0; + unsigned int maxval = 0; + unsigned int i = 0; + + while (i < 3) { + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" + << std::endl; + return false; + } + + if (header[0] == '#') + continue; + + if (i == 0) + i += SSCANF(header, "%u %u %u", &width, &height, &maxval); + else if (i == 1) + i += SSCANF(header, "%u %u", &height, &maxval); + else if (i == 2) + i += SSCANF(header, "%u", &maxval); + } + + // check if given handle for the data is initialized + if (NULL != *data) { + if (*w != width || *h != height) + std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; + } else { + *data = (unsigned char*)malloc(sizeof(unsigned char) * width * height * + *channels); + *w = width; + *h = height; + } + + // read and close file + if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0) + std::cerr << "__LoadPPM() read data returned error." << std::endl; + + fclose(fp); + + return true; +} + +template +inline bool sdkLoadPGM(const char* file, T** data, unsigned int* w, + unsigned int* h) { + unsigned char* idata = NULL; + unsigned int channels; + + if (true != __loadPPM(file, &idata, w, h, &channels)) + return false; + + unsigned int size = *w * *h * channels; + + // initialize mem if necessary + // the correct size is checked / set in loadPGMc() + if (NULL == *data) + *data = reinterpret_cast(malloc(sizeof(T) * size)); + + // copy and cast data + std::transform(idata, idata + size, *data, + helper_image_internal::ConverterFromUByte()); + + free(idata); + + return true; +} + +template +inline bool sdkLoadPPM4(const char* file, T** data, unsigned int* w, + unsigned int* h) { + unsigned char* idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char* idata_orig = idata; + *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); + unsigned char* ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return true; + } else { + free(idata); + return false; + } +} + +inline bool __savePPM(const char* file, unsigned char* data, unsigned int w, + unsigned int h, unsigned int channels) { + assert(NULL != data); + assert(w > 0); + assert(h > 0); + + std::fstream fh(file, std::fstream::out | std::fstream::binary); + + if (fh.bad()) { + std::cerr << "__savePPM() : Opening file failed." << std::endl; + return false; + } + + if (channels == 1) { + fh << "P5\n"; + } else if (channels == 3) { + fh << "P6\n"; + } else { + std::cerr << "__savePPM() : Invalid number of channels." << std::endl; + return false; + } + + fh << w << "\n" << h << "\n" << 0xff << std::endl; + + for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) + fh << data[i]; + + fh.flush(); + + if (fh.bad()) { + std::cerr << "__savePPM() : Writing data failed." << std::endl; + return false; + } + + fh.close(); + + return true; +} + +template +inline bool sdkSavePGM(const char* file, T* data, unsigned int w, + unsigned int h) { + unsigned int size = w * h; + unsigned char* idata = (unsigned char*)malloc(sizeof(unsigned char) * size); + + std::transform(data, data + size, idata, + helper_image_internal::ConverterToUByte()); + + // write file + bool result = __savePPM(file, idata, w, h, 1); + + // cleanup + free(idata); + + return result; +} + +inline bool sdkSavePPM4ub(const char* file, unsigned char* data, unsigned int w, + unsigned int h) { + // strip 4th component + int size = w * h; + unsigned char* ndata = + (unsigned char*)malloc(sizeof(unsigned char) * size * 3); + unsigned char* ptr = ndata; + + for (int i = 0; i < size; i++) { + *ptr++ = *data++; + *ptr++ = *data++; + *ptr++ = *data++; + data++; + } + + bool result = __savePPM(file, ndata, w, h, 3); + free(ndata); + return result; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFile(const char* filename, T** data, unsigned int* len, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // intermediate storage for the data read + std::vector data_read; + + // open file for reading + FILE* fh = NULL; + + // check if filestream is valid + if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) { + printf("Unable to open input file: %s\n", filename); + return false; + } + + // read all data elements + T token; + + while (!feof(fh)) { + fscanf(fh, "%f", &token); + data_read.push_back(token); + } + + // the last element is read twice + data_read.pop_back(); + fclose(fh); + + // check if the given handle is already initialized + if (NULL != *data) { + if (*len != data_read.size()) { + std::cerr << "sdkReadFile() : Initialized memory given but " + << "size mismatch with signal read " + << "(data read / data init = " << (unsigned int)data_read.size() + << " / " << *len << ")" << std::endl; + + return false; + } + } else { + // allocate storage for the data read + *data = reinterpret_cast(malloc(sizeof(T) * data_read.size())); + // store signal size + *len = static_cast(data_read.size()); + } + + // copy data + memcpy(*data, &data_read.front(), sizeof(T) * data_read.size()); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFileBlocks(const char* filename, T** data, unsigned int* len, + unsigned int block_num, unsigned int block_size, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // open file for reading + FILE* fh = fopen(filename, "rb"); + + if (fh == NULL && verbose) { + std::cerr << "sdkReadFile() : Opening file failed." << std::endl; + return false; + } + + // check if the given handle is already initialized + // allocate storage for the data read + data[block_num] = reinterpret_cast(malloc(block_size)); + + // read all data elements + fseek(fh, block_num * block_size, SEEK_SET); + *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh); + + fclose(fh); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename +//! @return true if writing the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkWriteFile(const char* filename, const T* data, unsigned int len, + const S epsilon, bool verbose, bool append = false) { + assert(NULL != filename); + assert(NULL != data); + + // open file for writing + // if (append) { + std::fstream fh(filename, std::fstream::out | std::fstream::ate); + + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename + << " for write/append." << std::endl; + } + + /* } else { + std::fstream fh(filename, std::fstream::out); + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename << " for + write." << std::endl; + } + } + */ + + // check if filestream is valid + if (!fh.good()) { + if (verbose) + std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; + + return false; + } + + // first write epsilon + fh << "# " << epsilon << "\n"; + + // write data + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) + fh << data[i] << ' '; + + // Check if writing succeeded + if (!fh.good()) { + if (verbose) + std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; + + return false; + } + + // file ends with nl + fh << std::endl; + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference timer_interface to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareData(const T* reference, const T* data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + bool result = true; + unsigned int error_count = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = static_cast(reference[i]) - static_cast(data[i]); + bool comp = (diff <= epsilon) && (diff >= -epsilon); + result &= comp; + + error_count += !comp; + +#if 0 + + if (!comp) { + std::cerr << "ERROR, i = " << i << ",\t " + << reference[i] << " / " + << data[i] + << " (reference / data)\n"; + } + +#endif + } + + if (threshold == 0.0f) { + return (result) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return (len * threshold > error_count) ? true : false; + } +} + +#ifndef __MIN_EPSILON_ERROR +#define __MIN_EPSILON_ERROR 1e-3f +#endif + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param epsilon threshold % of (# of bytes) for pass/fail +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareDataAsFloatThreshold(const T* reference, const T* data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + // If we set epsilon to be 0, let's set a minimum threshold + float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + int error_count = 0; + bool result = true; + + for (unsigned int i = 0; i < len; ++i) { + float diff = + fabs(static_cast(reference[i]) - static_cast(data[i])); + bool comp = (diff < max_error); + result &= comp; + + if (!comp) + error_count++; + } + + if (threshold == 0.0f) { + if (error_count) + printf("total # of errors = %d\n", error_count); + + return (error_count == 0) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return ((len * threshold > error_count) ? true : false); + } +} + +inline void sdkDumpBin(void* data, unsigned int bytes, const char* filename) { + printf("sdkDumpBin: <%s>\n", filename); + FILE* fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline bool sdkCompareBin2BinUint(const char* src_file, const char* ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char* exec_path) { + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + uint64_t error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", + src_file); + error_count++; + } + + char* ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", + ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) + fclose(src_fp); + + if (ref_fp) + fclose(ref_fp); + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) { + src_buffer = (unsigned int*)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int*)malloc(nelements * sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf("> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize)); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize)); + + if (!compareData(ref_buffer, src_buffer, nelements, + epsilon, threshold)) + error_count++; + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) + fclose(src_fp); + + if (ref_fp) + fclose(ref_fp); + } + } + + if (error_count == 0) + printf(" OK\n"); + else + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareBin2BinFloat(const char* src_file, const char* ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char* exec_path) { + float *src_buffer = NULL, *ref_buffer = NULL; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + uint64_t error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char* ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, + exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) + fclose(src_fp); + + if (ref_fp) + fclose(ref_fp); + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin unable to open ref_file: %s\n", + ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) { + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + + printf("> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); + fsize = fread(src_buffer, sizeof(float), nelements, src_fp); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize * sizeof(float))); + fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize * sizeof(float))); + + if (!compareDataAsFloatThreshold( + ref_buffer, src_buffer, nelements, epsilon, threshold)) + error_count++; + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) + fclose(src_fp); + + if (ref_fp) + fclose(ref_fp); + } + } + + if (error_count == 0) + printf(" OK\n"); + else + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareL2fe(const float* reference, const float* data, + const unsigned int len, const float epsilon) { + assert(epsilon >= 0); + + float error = 0; + float ref = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = reference[i] - data[i]; + error += diff * diff; + ref += reference[i] * reference[i]; + } + + float normRef = sqrtf(ref); + + if (fabs(ref) < 1e-7) { +#ifdef _DEBUG + std::cerr << "ERROR, reference l2-norm is 0\n"; +#endif + return false; + } + + float normError = sqrtf(error); + error = normError / normRef; + bool result = error < epsilon; +#ifdef _DEBUG + + if (!result) { + std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " + << epsilon << "\n"; + } + +#endif + + return result; +} + +inline bool sdkLoadPPMub(const char* file, unsigned char** data, + unsigned int* w, unsigned int* h) { + unsigned int channels; + return __loadPPM(file, data, w, h, &channels); +} + +inline bool sdkLoadPPM4ub(const char* file, unsigned char** data, + unsigned int* w, unsigned int* h) { + unsigned char* idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char* idata_orig = idata; + *data = (unsigned char*)malloc(sizeof(unsigned char) * size * 4); + unsigned char* ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return true; + } else { + free(idata); + return false; + } +} + +inline bool sdkComparePPM(const char* src_file, const char* ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data, *ref_data; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) { + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + } + + if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, + threshold) == false) + error_count = 1; + + if (error_count == 0) { + if (verboseErrors) + std::cerr << " OK\n\n"; + } else { + if (verboseErrors) + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + + // returns true if all pixels pass + return (error_count == 0) ? true : false; +} + +inline bool sdkComparePGM(const char* src_file, const char* ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data = 0, *ref_data = 0; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width * src_height, epsilon, + threshold) == false) + error_count = 1; + + if (error_count == 0) { + if (verboseErrors) + std::cerr << " OK\n\n"; + } else { + if (verboseErrors) + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + + // returns true if all pixels pass + return (error_count == 0) ? true : false; +} + +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_string.h b/demos/CUDA/BlackScholes/helper/helper_string.h new file mode 100644 index 000000000..f6c25f659 --- /dev/null +++ b/demos/CUDA/BlackScholes/helper/helper_string.h @@ -0,0 +1,441 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char* string) { + int string_start = 0; + + while (string[string_start] == delimiter) + string_start++; + + if (string_start >= static_cast(strlen(string) - 1)) + return 0; + + return string_start; +} + +inline int getFileExtension(char* filename, char** extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') + if (string_length == 0) + break; + + if (string_length > 0) + string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char** argv, + const char* string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char* string_argv = &argv[i][string_start]; + + const char* equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char** argv, + const char* string_ref, T* value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char* string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char** argv, + const char* string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char* string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) + return value; + else + return 0; +} + +inline float getCmdLineArgumentFloat(const int argc, const char** argv, + const char* string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char* string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) + return value; + else + return 0; +} + +inline bool getCmdLineArgumentString(const int argc, const char** argv, + const char* string_ref, + char** string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char* string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) + *string_retval = NULL; + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char* sdkFindFilePath(const char* filename, + const char* executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char* searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up + // 4 + // in + // tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 + // in + // tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in + // tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in + // tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in + // tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + "../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 + // in + // tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in + // tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in + // tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in + // tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in + // tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 + // in + // tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in + // tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in + // tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char*); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE* fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char* file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) + fclose(fp); + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_timer.h b/demos/CUDA/BlackScholes/helper/helper_timer.h new file mode 100644 index 000000000..3869bf8ca --- /dev/null +++ b/demos/CUDA/BlackScholes/helper/helper_timer.h @@ -0,0 +1,448 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Helper Timing Functions +#ifndef COMMON_HELPER_TIMER_H_ +#define COMMON_HELPER_TIMER_H_ + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include + +// Definition of the StopWatch Interface, this is used if we don't want to use +// the CUT functions But rather in a self contained class interface +class StopWatchInterface { +public: + StopWatchInterface() {} + virtual ~StopWatchInterface() {} + +public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface { +public: + //! Constructor, default + StopWatchWin() + : start_time(), end_time(), diff_time(0.0f), total_time(0.0f), + running(false), clock_sessions(0), freq(0), freq_set(false) { + if (!freq_set) { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency(reinterpret_cast(&temp)); + + // convert to type in which it is needed + freq = (static_cast(temp.QuadPart)) / 1000.0; + + // rememeber query + freq_set = true; + } + } + + // Destructor + ~StopWatchWin() {} + +public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + +private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::start() { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::stop() { + QueryPerformanceCounter(reinterpret_cast(&end_time)); + diff_time = static_cast(((static_cast(end_time.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + QueryPerformanceCounter(reinterpret_cast(&start_time)); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + LARGE_INTEGER temp; + QueryPerformanceCounter(reinterpret_cast(&temp)); + retval += static_cast(((static_cast(temp.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface { +public: + //! Constructor, default + StopWatchLinux() + : start_time(), diff_time(0.0), total_time(0.0), running(false), + clock_sessions(0) {} + + // Destructor + virtual ~StopWatchLinux() {} + +public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + +private: + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + +private: + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::start() { + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::stop() { + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + gettimeofday(&start_time, 0); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) + retval += getDiffTime(); + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getDiffTime() { + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! @param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkCreateTimer(StopWatchInterface** timer_interface) { +// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = reinterpret_cast(new StopWatchWin()); +#else + *timer_interface = + reinterpret_cast(new StopWatchLinux()); +#endif + return (*timer_interface != NULL) ? true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkDeleteTimer(StopWatchInterface** timer_interface) { + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStartTimer(StopWatchInterface** timer_interface) { + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + (*timer_interface)->start(); + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStopTimer(StopWatchInterface** timer_interface) { + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + (*timer_interface)->stop(); + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkResetTimer(StopWatchInterface** timer_interface) { + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + (*timer_interface)->reset(); + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetAverageTimerValue(StopWatchInterface** timer_interface) { + // printf("sdkGetAverageTimerValue called object %08x\n", (void + // *)*timer_interface); + if (*timer_interface) + return (*timer_interface)->getAverageTime(); + else + return 0.0f; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetTimerValue(StopWatchInterface** timer_interface) { + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + return (*timer_interface)->getTime(); + else + return 0.0f; +} + +#endif // COMMON_HELPER_TIMER_H_ diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index d309ababa..bb572cf7a 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -396,6 +396,28 @@ inline void free_pushforward(void* ptr, void* d_ptr) { // NOLINTEND(cppcoreguidelines-owning-memory) // NOLINTEND(cppcoreguidelines-no-malloc) +CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) { + *d_a += expf(a) * d_y; +} + +CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) { + *d_a += (a >= 0) ? d_y : -d_y; +} + +CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) { + *d_a += (1.F / a) * d_y; +} + +CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y, + float* d_a, float* d_b) { + *d_a += (1.F / b) * d_y; + *d_b += (-a / (b * b)) * d_y; +} + +CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) { + *d_a += (1.F / (2.F * sqrtf(a))) * d_y; +} + // These are required because C variants of mathematical functions are // defined in global namespace. using std::abs_pushforward; From e45932b53857696c450d42a5ad1eba141ff9b715 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 15 Nov 2024 14:21:48 +0200 Subject: [PATCH 02/18] Improve BlackScholes demo's clarity --- demos/CUDA/BlackScholes/BlackScholes.cu | 248 +++++++----------- demos/CUDA/BlackScholes/BlackScholes_gold.cpp | 6 +- .../CUDA/BlackScholes/BlackScholes_kernel.cuh | 6 + 3 files changed, 109 insertions(+), 151 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 3c3617b44..25a137a08 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -32,17 +32,17 @@ */ /* - * DISCLAIMER: The following file has been modified slightly to make it - * compatible with Clad. The original file can be found at NVIDIA's cuda-samples - * repository at GitHub. + * DISCLAIMER: The following file has been slightly modified to ensure + * compatibility with Clad and serve the purpose of a Clad demo. The original + * file is available in NVIDIA's cuda-samples repository on GitHub. * - * Relevant documentation regarding the problem at hand can be found at NVIDIA's - * cuda-samples repository. With the use of Clad, we compute some of the Greeks - * (sensitivities) for Black-Scholes and verify them using the - * theoretical values as denoted in Wikipedia + * Relevant documentation regarding the problem at hand can be found in NVIDIA's + * cuda-samples repository. Using Clad, we compute some of the Greeks + * (sensitivities) for the Black-Scholes model and verify them against + * approximations of their theoretical values as denoted in Wikipedia * (https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model). * - * To build and run the demo, run the following command: make run + * To build and run the demo, use the following command: make run */ #include "clad/Differentiator/Differentiator.h" @@ -148,93 +148,86 @@ double N_prime(double d) { enum Greek { Delta, dX, Theta }; -double computeL1norm_Call(float* S, float* X, float* T, float* d, Greek greek) { - double delta, ref, sum_delta, sum_ref; - sum_delta = 0; - sum_ref = 0; - switch (greek) { - case Delta: - for (int i = 0; i < OPT_N; i++) { - double d1_val = d1(S[i], X[i], T[i]); - ref = CND(d1_val); - delta = fabs(d[i] - ref); - sum_delta += delta; - sum_ref += fabs(ref); - } - break; - case dX: - for (int i = 0; i < OPT_N; i++) { - double T_val = T[i]; - double d1_val = d1(S[i], X[i], T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = -expRT * CND(d2_val); - delta = fabs(d[i] - ref); - sum_delta += delta; - sum_ref += fabs(ref); - } - break; - case Theta: - for (int i = 0; i < OPT_N; i++) { - double S_val = S[i], X_val = X[i], T_val = T[i]; - double d1_val = d1(S_val, X_val, T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = - (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) + - RISKFREE * X_val * expRT * - CND(d2_val); // theta is with respect to t, so -theta is the - // approximation of the derivative with respect to T - delta = fabs(d[i] - ref); - sum_delta += delta; - sum_ref += fabs(ref); - } - } +enum OptionType { Call, Put }; + +template const char* getNameofOpt() { + if constexpr (opt == Call) + return "Call"; + if constexpr (opt == Put) + return "Put"; +} - return sum_delta / sum_ref; +template const char* getNameOfGreek() { + if constexpr (greek == Delta) + return "Delta"; + if constexpr (greek == dX) + return "dStrike"; + if constexpr (greek == Theta) + return "Theta"; } -double computeL1norm_Put(float* S, float* X, float* T, float* d, Greek greek) { +template +void computeL1norm(float* S, float* X, float* T, float* d) { double delta, ref, sum_delta, sum_ref; sum_delta = 0; sum_ref = 0; - switch (greek) { - case Delta: - for (int i = 0; i < OPT_N; i++) { - double d1_val = d1(S[i], X[i], T[i]); - ref = CND(d1_val) - 1.0; - delta = fabs(d[i] - ref); - sum_delta += delta; - sum_ref += fabs(ref); - } - break; - case dX: - for (int i = 0; i < OPT_N; i++) { - double T_val = T[i]; - double d1_val = d1(S[i], X[i], T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = expRT * CND(-d2_val); - delta = fabs(d[i] - ref); - sum_delta += delta; - sum_ref += fabs(ref); - } - break; - case Theta: - for (int i = 0; i < OPT_N; i++) { - double S_val = S[i], X_val = X[i], T_val = T[i]; - double d1_val = d1(S_val, X_val, T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) - - RISKFREE * X_val * expRT * CND(-d2_val); - delta = fabs(d[i] - ref); - sum_delta += delta; - sum_ref += fabs(ref); + for (int i = 0; i < OPT_N; i++) { + if constexpr (opt == Call) { + if constexpr (greek == Delta) { + double d1_val = d1(S[i], X[i], T[i]); + ref = CND(d1_val); + } else if constexpr (greek == dX) { + double T_val = T[i]; + double d1_val = d1(S[i], X[i], T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = -expRT * CND(d2_val); + } else if constexpr (greek == Theta) { + double S_val = S[i], X_val = X[i], T_val = T[i]; + double d1_val = d1(S_val, X_val, T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) + + RISKFREE * X_val * expRT * + CND(d2_val); // theta is with respect to t, so -theta is the + // approximation of the derivative with respect + // to T + } + } else if constexpr (opt == Put) { + if constexpr (greek == Delta) { + double d1_val = d1(S[i], X[i], T[i]); + ref = CND(d1_val) - 1.0; + } else if constexpr (greek == dX) { + double T_val = T[i]; + double d1_val = d1(S[i], X[i], T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = expRT * CND(-d2_val); + } else if constexpr (greek == Theta) { + double S_val = S[i], X_val = X[i], T_val = T[i]; + double d1_val = d1(S_val, X_val, T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) - + RISKFREE * X_val * expRT * CND(-d2_val); + } } + + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); } - return sum_delta / sum_ref; + double L1norm = sum_delta / sum_ref; + printf("L1norm of %s for %s option = %E\n", getNameOfGreek(), + getNameofOpt(), L1norm); + if (L1norm > 1e-5) { + printf( + "Gradient test failed: Difference between %s's computed and " + "approximated theoretical values for %s option is larger than expected", + getNameOfGreek(), getNameofOpt()); + exit(EXIT_FAILURE); + } } int main(int argc, char** argv) { @@ -257,6 +250,8 @@ int main(int argc, char** argv) { h_OptionYears[i] = RandFloat(0.25f, 10.0f); } + /*******************************************************************************/ + // Compute gradients auto callGrad = clad::gradient( launch, "h_CallResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears"); @@ -275,9 +270,9 @@ int main(int argc, char** argv) { d_PutResultGPU[i] = 1.0f; } - // Launch the kernel and the gradient + /*******************************************************************************/ - // Compute the derivatives of the price of the call options + // Compute the values and derivatives of the price of the call options callGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU, h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears, d_CallResultGPU, d_StockPrice, d_OptionStrike, @@ -304,40 +299,18 @@ int main(int argc, char** argv) { } // Verify delta - L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears, - d_StockPrice, Delta); - printf("L1norm of delta for Call option = %E\n", L1norm); - if (L1norm > 1e-5) { - printf("Gradient test failed: the difference between the computed and the " - "approximated theoretical delta for Call option is larger than " - "expected\n"); - return EXIT_FAILURE; - } - + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_StockPrice); // Verify derivatives with respect to the Strike price - L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionStrike, dX); - printf("L1norm of derivative of Call w.r.t. the strike price = %E\n", L1norm); - if (L1norm > 1e-5) { - printf( - "Gradient test failed: the difference between the computed and the " - "approximated theoretical derivative of Call w.r.t. the strike price " - "is larger than expected\n"); - return EXIT_FAILURE; - } - + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionStrike); // Verify theta - L1norm = computeL1norm_Call(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionYears, Theta); - printf("L1norm of theta for Call option = %E\n", L1norm); - if (L1norm > 1e-5) { - printf("Gradient test failed: the difference between the computed and the " - "approximated theoretical theta for Call option is larger than " - "expected\n"); - return EXIT_FAILURE; - } + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionYears); - // Compute the derivatives of the price of the Put options + /*******************************************************************************/ + + // Re-initialize data for next gradient call for (int i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; h_PutResultCPU[i] = -1.0f; @@ -351,43 +324,22 @@ int main(int argc, char** argv) { d_OptionYears[i] = 0.f; } + // Compute the values and derivatives of the price of the Put options putGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU, h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears, d_PutResultGPU, d_StockPrice, d_OptionStrike, d_OptionYears); // Verify delta - L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, - d_StockPrice, Delta); - printf("L1norm of delta for Put option = %E\n", L1norm); - if (L1norm > 1e-5) { - printf("Gradient test failed: the difference between the computed and " - "the approximated theoretical delta for Put option is larger than " - "expected\n"); - return EXIT_FAILURE; - } - + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_StockPrice); // Verify derivatives with respect to the Strike price - L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionStrike, dX); - printf("L1norm of derivative of Put w.r.t. the strike price = %E\n", L1norm); - if (L1norm > 1e-6) { - printf("Gradient test failed: the difference between the computed and the " - "approximated theoretcial derivative of " - "Put w.r.t. the strike price is larger than expected\n"); - return EXIT_FAILURE; - } - + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionStrike); // Verify theta - L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionYears, Theta); - printf("L1norm of theta for Put option = %E\n", L1norm); - if (L1norm > 1e-5) { - printf("Gradient test failed: the difference between the computed and the " - "approximated theoretical theta for Put option is larger than " - "expected\n"); - return EXIT_FAILURE; - } + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionYears); + /*******************************************************************************/ free(h_OptionYears); free(h_OptionStrike); free(h_StockPrice); diff --git a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp index 6b0d9c8ef..d61003f1f 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp +++ b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp @@ -26,9 +26,9 @@ */ /* - * DISCLAIMER: The following file has been modified slightly to make it - * compatible with Clad. The original file can be found at NVIDIA's cuda-samples - * repository at GitHub. + * DISCLAIMER: The following file has been slightly modified to ensure + * compatibility with Clad. The original file is available in NVIDIA's + * cuda-samples repository on GitHub. */ #include diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh index 26497b8ac..78d032212 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -25,6 +25,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* + * DISCLAIMER: The following file has been slightly modified to ensure + * compatibility with Clad. The original file is available in NVIDIA's + * cuda-samples repository on GitHub. + */ + //////////////////////////////////////////////////////////////////////////////// // Polynomial approximation of cumulative normal distribution function //////////////////////////////////////////////////////////////////////////////// From 6f4379f5685e9833807c895b74d51823badd4d07 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 15 Nov 2024 14:47:52 +0200 Subject: [PATCH 03/18] Clarify the changes made to the original file --- demos/CUDA/BlackScholes/BlackScholes.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 25a137a08..83311b489 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -33,8 +33,12 @@ /* * DISCLAIMER: The following file has been slightly modified to ensure - * compatibility with Clad and serve the purpose of a Clad demo. The original - * file is available in NVIDIA's cuda-samples repository on GitHub. + * compatibility with Clad and to serve as a Clad demo. Specifically, parts of + * the original `main` function have been moved to a separate function to use + * `clad::gradient` on. Furthermore, original print statements have been removed + * and new helper functions are now included in the file to verify the + * gradient's results. The original file is available in NVIDIA's cuda-samples + * repository on GitHub. * * Relevant documentation regarding the problem at hand can be found in NVIDIA's * cuda-samples repository. Using Clad, we compute some of the Greeks From 4a3140d01586810ed6ed26e97d19d318b0aef621 Mon Sep 17 00:00:00 2001 From: kchristin Date: Sat, 16 Nov 2024 18:19:19 +0200 Subject: [PATCH 04/18] Improve sqrtT assignment --- demos/CUDA/BlackScholes/BlackScholes_kernel.cuh | 10 ++++------ include/clad/Differentiator/BuiltinDerivatives.h | 9 +++++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh index 78d032212..bba537301 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -66,7 +66,7 @@ __device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult, float sqrtT, expRT; float d1, d2, CNDD1, CNDD2; - sqrtT = fdividef(1.0F, 1.0 / sqrtf(T)); + sqrtT = sqrtf(T); d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); d2 = d1 - V * sqrtT; @@ -106,9 +106,7 @@ __global__ void BlackScholesGPU(float2* __restrict d_CallResult, BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y, d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree, Volatility); - d_CallResult[opt].x = callResult1; - d_CallResult[opt].y = callResult2; - d_PutResult[opt].x = putResult1; - d_PutResult[opt].y = putResult2; + d_CallResult[opt] = make_float2(callResult1, callResult2); + d_PutResult[opt] = make_float2(putResult1, putResult2); } -} +} \ No newline at end of file diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index bb572cf7a..adbbb40e9 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -418,6 +418,15 @@ CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) { *d_a += (1.F / (2.F * sqrtf(a))) * d_y; } + +#ifdef __CUDACC__ +CUDA_HOST_DEVICE inline void make_float2_pullback(float a, float b, float2 d_y, + float* d_a, float* d_b) { + *d_a += d_y.x; + *d_b += d_y.y; +} +#endif + // These are required because C variants of mathematical functions are // defined in global namespace. using std::abs_pushforward; From 363b7d3d3aed14beb22c5f9a5d7d226f48500107 Mon Sep 17 00:00:00 2001 From: kchristin Date: Sun, 17 Nov 2024 14:51:19 +0200 Subject: [PATCH 05/18] Add launch_bounds attr to kernel --- demos/CUDA/BlackScholes/BlackScholes_kernel.cuh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh index bba537301..1db634b4c 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -82,12 +82,13 @@ __device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult, //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on GPU //////////////////////////////////////////////////////////////////////////////// -__global__ void BlackScholesGPU(float2* __restrict d_CallResult, - float2* __restrict d_PutResult, - float2* __restrict d_StockPrice, - float2* __restrict d_OptionStrike, - float2* __restrict d_OptionYears, - float Riskfree, float Volatility, int optN) { +__launch_bounds__(128) __global__ + void BlackScholesGPU(float2* __restrict d_CallResult, + float2* __restrict d_PutResult, + float2* __restrict d_StockPrice, + float2* __restrict d_OptionStrike, + float2* __restrict d_OptionYears, float Riskfree, + float Volatility, int optN) { ////Thread index // const int tid = blockDim.x * blockIdx.x + threadIdx.x; ////Total number of threads in execution grid From 5cd5dc48331a62cc8a987599f931e1ab6e19fa93 Mon Sep 17 00:00:00 2001 From: kchristin Date: Sun, 17 Nov 2024 14:55:03 +0200 Subject: [PATCH 06/18] Add specification for change made in CPU code --- demos/CUDA/BlackScholes/BlackScholes_gold.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp index d61003f1f..219ee9865 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp +++ b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp @@ -27,8 +27,10 @@ /* * DISCLAIMER: The following file has been slightly modified to ensure - * compatibility with Clad. The original file is available in NVIDIA's - * cuda-samples repository on GitHub. + * compatibility with Clad. Specifically, the only change made was removing the + * static property of the `CND` function so it can be called in the main file to + * verify Clad's results. The original file is available for comparison in + * NVIDIA's cuda-samples repository on GitHub. */ #include From 10a4d27f7bcf835a5746e8dccb3b8533cd4c261b Mon Sep 17 00:00:00 2001 From: kchristin Date: Sun, 17 Nov 2024 16:16:29 +0200 Subject: [PATCH 07/18] Add CUDA builtins in different file and keep BlackScholes.cuh same as original --- .../CUDA/BlackScholes/BlackScholes_kernel.cuh | 16 +++------- .../clad/Differentiator/BuiltinDerivatives.h | 23 ------------- .../Differentiator/BuiltinDerivativesCUDA.cuh | 32 +++++++++++++++++++ include/clad/Differentiator/Differentiator.h | 3 ++ 4 files changed, 40 insertions(+), 34 deletions(-) create mode 100644 include/clad/Differentiator/BuiltinDerivativesCUDA.cuh diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh index 1db634b4c..aed2ec643 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -25,12 +25,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* - * DISCLAIMER: The following file has been slightly modified to ensure - * compatibility with Clad. The original file is available in NVIDIA's - * cuda-samples repository on GitHub. - */ - //////////////////////////////////////////////////////////////////////////////// // Polynomial approximation of cumulative normal distribution function //////////////////////////////////////////////////////////////////////////////// @@ -42,9 +36,9 @@ __device__ inline float cndGPU(float d) { const float A5 = 1.330274429f; const float RSQRT2PI = 0.39894228040143267793994605993438f; - float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); + float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); - float cnd = RSQRT2PI * expf(-0.5f * d * d) * + float cnd = RSQRT2PI * __expf(-0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); if (d > 0) @@ -66,15 +60,15 @@ __device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult, float sqrtT, expRT; float d1, d2, CNDD1, CNDD2; - sqrtT = sqrtf(T); - d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); + sqrtT = __fdividef(1.0F, rsqrtf(T)); + d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); d2 = d1 - V * sqrtT; CNDD1 = cndGPU(d1); CNDD2 = cndGPU(d2); // Calculate Call and Put simultaneously - expRT = expf(-R * T); + expRT = __expf(-R * T); CallResult = S * CNDD1 - X * expRT * CNDD2; PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); } diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index adbbb40e9..0221dbe28 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -396,37 +396,14 @@ inline void free_pushforward(void* ptr, void* d_ptr) { // NOLINTEND(cppcoreguidelines-owning-memory) // NOLINTEND(cppcoreguidelines-no-malloc) -CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) { - *d_a += expf(a) * d_y; -} - CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) { *d_a += (a >= 0) ? d_y : -d_y; } -CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) { - *d_a += (1.F / a) * d_y; -} - -CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y, - float* d_a, float* d_b) { - *d_a += (1.F / b) * d_y; - *d_b += (-a / (b * b)) * d_y; -} - CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) { *d_a += (1.F / (2.F * sqrtf(a))) * d_y; } - -#ifdef __CUDACC__ -CUDA_HOST_DEVICE inline void make_float2_pullback(float a, float b, float2 d_y, - float* d_a, float* d_b) { - *d_a += d_y.x; - *d_b += d_y.y; -} -#endif - // These are required because C variants of mathematical functions are // defined in global namespace. using std::abs_pushforward; diff --git a/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh new file mode 100644 index 000000000..9179a7856 --- /dev/null +++ b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh @@ -0,0 +1,32 @@ +#include "clad/Differentiator/CladConfig.h" + +namespace clad { + +namespace custom_derivatives { + +__device__ inline void __expf_pullback(float a, float d_y, float* d_a) { + *d_a += expf(a) * d_y; +} + +__device__ inline void __logf_pullback(float a, float d_y, float* d_a) { + *d_a += (1.F / a) * d_y; +} + +__device__ inline void __fdividef_pullback(float a, float b, float d_y, + float* d_a, float* d_b) { + *d_a += (1.F / b) * d_y; + *d_b += (-a / (b * b)) * d_y; +} + +__device__ inline void rsqrtf_pullback(float a, float d_y, float* d_a) { + // Compute the gradient of rsqrt with respect to x + *d_a = d_y * (-0.5 * powf(a, -1.5)); +} + +__device__ inline void make_float2_pullback(float a, float b, float2 d_y, + float* d_a, float* d_b) { + *d_a += d_y.x; + *d_b += d_y.y; +} +} +} diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h index 1ae60d961..c8aaaa286 100644 --- a/include/clad/Differentiator/Differentiator.h +++ b/include/clad/Differentiator/Differentiator.h @@ -10,6 +10,9 @@ #include "Array.h" #include "ArrayRef.h" #include "BuiltinDerivatives.h" +#ifdef __CUDACC__ +#include "BuiltinDerivativesCUDA.cuh" +#endif #include "CladConfig.h" #include "DynamicGraph.h" #include "FunctionTraits.h" From 7acf1b24e8d1c58422d247403b3e37b26ab9da7f Mon Sep 17 00:00:00 2001 From: kchristin Date: Sun, 17 Nov 2024 16:31:33 +0200 Subject: [PATCH 08/18] Fix format --- include/clad/Differentiator/BuiltinDerivativesCUDA.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh index 9179a7856..ef5ec9df8 100644 --- a/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh +++ b/include/clad/Differentiator/BuiltinDerivativesCUDA.cuh @@ -13,7 +13,7 @@ __device__ inline void __logf_pullback(float a, float d_y, float* d_a) { } __device__ inline void __fdividef_pullback(float a, float b, float d_y, - float* d_a, float* d_b) { + float* d_a, float* d_b) { *d_a += (1.F / b) * d_y; *d_b += (-a / (b * b)) * d_y; } @@ -24,9 +24,9 @@ __device__ inline void rsqrtf_pullback(float a, float d_y, float* d_a) { } __device__ inline void make_float2_pullback(float a, float b, float2 d_y, - float* d_a, float* d_b) { + float* d_a, float* d_b) { *d_a += d_y.x; *d_b += d_y.y; } -} -} +} // namespace custom_derivatives +} // namespace clad From b516dbbbe5222d525352402f6c916117616024da Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 20 Nov 2024 19:20:10 +0200 Subject: [PATCH 09/18] Add most of the original print statements of the demo --- demos/CUDA/BlackScholes/BlackScholes.cu | 99 ++++++++++++++++++++----- 1 file changed, 82 insertions(+), 17 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 83311b489..d3fda70ec 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -235,18 +235,43 @@ void computeL1norm(float* S, float* X, float* T, float* d) { } int main(int argc, char** argv) { - float* h_CallResultCPU = (float*)malloc(OPT_SZ); - float* h_PutResultCPU = (float*)malloc(OPT_SZ); - float* h_CallResultGPU = (float*)malloc(OPT_SZ); - float* h_PutResultGPU = (float*)malloc(OPT_SZ); - float* h_StockPrice = (float*)malloc(OPT_SZ); - float* h_OptionStrike = (float*)malloc(OPT_SZ); - float* h_OptionYears = (float*)malloc(OPT_SZ); + // Start logs + printf("[%s] - Starting...\n", argv[0]); + //'h_' prefix - CPU (host) memory space + float + // Results calculated by CPU for reference + *h_CallResultCPU, + *h_PutResultCPU, + // CPU copy of GPU results + *h_CallResultGPU, *h_PutResultGPU, + // CPU instance of input data + *h_StockPrice, *h_OptionStrike, *h_OptionYears; + + double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; + + StopWatchInterface* hTimer = NULL; + int i; + + findCudaDevice(argc, (const char**)argv); + + sdkCreateTimer(&hTimer); + + printf("Initializing data...\n"); + printf("...allocating CPU memory for options.\n"); + h_CallResultCPU = (float*)malloc(OPT_SZ); + h_PutResultCPU = (float*)malloc(OPT_SZ); + h_CallResultGPU = (float*)malloc(OPT_SZ); + h_PutResultGPU = (float*)malloc(OPT_SZ); + h_StockPrice = (float*)malloc(OPT_SZ); + h_OptionStrike = (float*)malloc(OPT_SZ); + h_OptionYears = (float*)malloc(OPT_SZ); + + printf("...generating input data in CPU mem.\n"); srand(5347); // Generate options set - for (int i = 0; i < OPT_N; i++) { + for (i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; h_PutResultCPU[i] = -1.0f; h_StockPrice[i] = RandFloat(5.0f, 30.0f); @@ -276,31 +301,56 @@ int main(int argc, char** argv) { /*******************************************************************************/ + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); // Compute the values and derivatives of the price of the call options callGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU, h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears, d_CallResultGPU, d_StockPrice, d_OptionStrike, d_OptionYears); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; + + // Both call and put is calculated + printf("Options count : %i \n", 2 * OPT_N); + printf("BlackScholesGPU() time : %f msec\n", gpuTime); + printf("Effective memory bandwidth: %f GB/s\n", + ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); + printf("Gigaoptions per second : %f \n\n", + ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); + + printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " + "options, NumDevsUsed = %u, Workgroup = %u\n", + (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), + gpuTime * 1e-3, (2 * OPT_N), 1, 128); + + printf("Checking the results...\n"); + // Calculate max absolute difference and L1 distance + // between CPU and GPU results + printf("Comparing the results...\n"); // Calculate max absolute difference and L1 distance // between CPU and GPU results - double delta, ref, sum_delta, sum_ref, L1norm; sum_delta = 0; sum_ref = 0; + max_delta = 0; - for (int i = 0; i < OPT_N; i++) { + for (i = 0; i < OPT_N; i++) { ref = h_CallResultCPU[i]; delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); + + if (delta > max_delta) + max_delta = delta; + sum_delta += delta; sum_ref += fabs(ref); } L1norm = sum_delta / sum_ref; - printf("L1norm = %E\n", L1norm); - if (L1norm > 1e-6) { - printf("Original test failed\n"); - return EXIT_FAILURE; - } + printf("L1 norm: %E\n", L1norm); + printf("Max absolute error: %E\n\n", max_delta); // Verify delta computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, @@ -344,6 +394,8 @@ int main(int argc, char** argv) { d_OptionYears); /*******************************************************************************/ + printf("Shutting down...\n"); + printf("...releasing CPU memory.\n"); free(h_OptionYears); free(h_OptionStrike); free(h_StockPrice); @@ -356,6 +408,19 @@ int main(int argc, char** argv) { free(d_StockPrice); free(d_PutResultGPU); free(d_CallResultGPU); + sdkDeleteTimer(&hTimer); - return EXIT_SUCCESS; -} + printf("Shutdown done.\n"); + + printf("\n[BlackScholes] - Test Summary\n"); + + if (L1norm > 1e-6) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n\n"); + printf("Test passed\n"); + exit(EXIT_SUCCESS); +} \ No newline at end of file From 7e81211defefffd75701004be1d053dc5bfa7d37 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 20 Nov 2024 19:37:43 +0200 Subject: [PATCH 10/18] Move verify gradient functions to another helper file --- demos/CUDA/BlackScholes/BlackScholes.cu | 119 +++--------------- .../BlackScholes/helper/helper_grad_verify.h | 110 ++++++++++++++++ 2 files changed, 124 insertions(+), 105 deletions(-) create mode 100644 demos/CUDA/BlackScholes/helper/helper_grad_verify.h diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index d3fda70ec..b5187be71 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -35,10 +35,12 @@ * DISCLAIMER: The following file has been slightly modified to ensure * compatibility with Clad and to serve as a Clad demo. Specifically, parts of * the original `main` function have been moved to a separate function to use - * `clad::gradient` on. Furthermore, original print statements have been removed - * and new helper functions are now included in the file to verify the - * gradient's results. The original file is available in NVIDIA's cuda-samples - * repository on GitHub. + * `clad::gradient` on. Furthermore, Clad cannot clone printf statements so some + * original print statements have been ommitted. The same applies to the + * checkCudaErrors function. + * New helper functions are included in another file and invoked here to verify + * the gradient's results. The original file is available in NVIDIA's + * cuda-samples repository on GitHub. * * Relevant documentation regarding the problem at hand can be found in NVIDIA's * cuda-samples repository. Using Clad, we compute some of the Greeks @@ -53,6 +55,7 @@ #include // helper functions CUDA error checking and initialization #include // helper functions for string parsing +#include //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on CPU @@ -77,17 +80,18 @@ float RandFloat(float low, float high) { return (1.0f - t) * low + t * high; } +// This section is included in the helper_grad_verify.h file //////////////////////////////////////////////////////////////////////////////// // Data configuration //////////////////////////////////////////////////////////////////////////////// -const int OPT_N = 4000000; -const int NUM_ITERATIONS = 512; +// const int OPT_N = 4000000; +// const int NUM_ITERATIONS = 512; -const int OPT_SZ = OPT_N * sizeof(float); -const float RISKFREE = 0.02f; -const float VOLATILITY = 0.30f; +// const int OPT_SZ = OPT_N * sizeof(float); +// const float RISKFREE = 0.02f; +// const float VOLATILITY = 0.30f; -#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) +// #define DIV_UP(a, b) (((a) + (b) - 1) / (b)) //////////////////////////////////////////////////////////////////////////////// // Main program @@ -139,101 +143,6 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, cudaFree(d_CallResult); } -double d1(double S, double X, double T) { - return (log(S / X) + (RISKFREE + 0.5 * VOLATILITY * VOLATILITY) * T) / - (VOLATILITY * sqrt(T)); -} - -double N_prime(double d) { - const double RSQRT2PI = - 0.39894228040143267793994605993438; // 1 / sqrt(2 * PI) - return RSQRT2PI * exp(-0.5 * d * d); -} - -enum Greek { Delta, dX, Theta }; - -enum OptionType { Call, Put }; - -template const char* getNameofOpt() { - if constexpr (opt == Call) - return "Call"; - if constexpr (opt == Put) - return "Put"; -} - -template const char* getNameOfGreek() { - if constexpr (greek == Delta) - return "Delta"; - if constexpr (greek == dX) - return "dStrike"; - if constexpr (greek == Theta) - return "Theta"; -} - -template -void computeL1norm(float* S, float* X, float* T, float* d) { - double delta, ref, sum_delta, sum_ref; - sum_delta = 0; - sum_ref = 0; - for (int i = 0; i < OPT_N; i++) { - if constexpr (opt == Call) { - if constexpr (greek == Delta) { - double d1_val = d1(S[i], X[i], T[i]); - ref = CND(d1_val); - } else if constexpr (greek == dX) { - double T_val = T[i]; - double d1_val = d1(S[i], X[i], T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = -expRT * CND(d2_val); - } else if constexpr (greek == Theta) { - double S_val = S[i], X_val = X[i], T_val = T[i]; - double d1_val = d1(S_val, X_val, T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) + - RISKFREE * X_val * expRT * - CND(d2_val); // theta is with respect to t, so -theta is the - // approximation of the derivative with respect - // to T - } - } else if constexpr (opt == Put) { - if constexpr (greek == Delta) { - double d1_val = d1(S[i], X[i], T[i]); - ref = CND(d1_val) - 1.0; - } else if constexpr (greek == dX) { - double T_val = T[i]; - double d1_val = d1(S[i], X[i], T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = expRT * CND(-d2_val); - } else if constexpr (greek == Theta) { - double S_val = S[i], X_val = X[i], T_val = T[i]; - double d1_val = d1(S_val, X_val, T_val); - double d2_val = d1_val - VOLATILITY * sqrt(T_val); - double expRT = exp(-RISKFREE * T_val); - ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) - - RISKFREE * X_val * expRT * CND(-d2_val); - } - } - - delta = fabs(d[i] - ref); - sum_delta += delta; - sum_ref += fabs(ref); - } - - double L1norm = sum_delta / sum_ref; - printf("L1norm of %s for %s option = %E\n", getNameOfGreek(), - getNameofOpt(), L1norm); - if (L1norm > 1e-5) { - printf( - "Gradient test failed: Difference between %s's computed and " - "approximated theoretical values for %s option is larger than expected", - getNameOfGreek(), getNameofOpt()); - exit(EXIT_FAILURE); - } -} - int main(int argc, char** argv) { // Start logs printf("[%s] - Starting...\n", argv[0]); diff --git a/demos/CUDA/BlackScholes/helper/helper_grad_verify.h b/demos/CUDA/BlackScholes/helper/helper_grad_verify.h new file mode 100644 index 000000000..804079ce9 --- /dev/null +++ b/demos/CUDA/BlackScholes/helper/helper_grad_verify.h @@ -0,0 +1,110 @@ +#include + +extern "C" double CND(double d); + +//////////////////////////////////////////////////////////////////////////////// +// Data configuration +//////////////////////////////////////////////////////////////////////////////// +const int OPT_N = 4000000; +const int NUM_ITERATIONS = 512; + +const int OPT_SZ = OPT_N * sizeof(float); +const float RISKFREE = 0.02f; +const float VOLATILITY = 0.30f; + +#define DIV_UP(a, b) (((a) + (b)-1) / (b)) + +double d1(double S, double X, double T) { + return (log(S / X) + (RISKFREE + 0.5 * VOLATILITY * VOLATILITY) * T) / + (VOLATILITY * sqrt(T)); +} + +double N_prime(double d) { + const double RSQRT2PI = + 0.39894228040143267793994605993438; // 1 / sqrt(2 * PI) + return RSQRT2PI * exp(-0.5 * d * d); +} + +enum Greek { Delta, dX, Theta }; + +enum OptionType { Call, Put }; + +template const char* getNameofOpt() { + if constexpr (opt == Call) + return "Call"; + if constexpr (opt == Put) + return "Put"; +} + +template const char* getNameOfGreek() { + if constexpr (greek == Delta) + return "Delta"; + if constexpr (greek == dX) + return "dStrike"; + if constexpr (greek == Theta) + return "Theta"; +} + +template +void computeL1norm(float* S, float* X, float* T, float* d) { + double delta, ref, sum_delta, sum_ref; + sum_delta = 0; + sum_ref = 0; + for (int i = 0; i < OPT_N; i++) { + if constexpr (opt == Call) { + if constexpr (greek == Delta) { + double d1_val = d1(S[i], X[i], T[i]); + ref = CND(d1_val); + } else if constexpr (greek == dX) { + double T_val = T[i]; + double d1_val = d1(S[i], X[i], T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = -expRT * CND(d2_val); + } else if constexpr (greek == Theta) { + double S_val = S[i], X_val = X[i], T_val = T[i]; + double d1_val = d1(S_val, X_val, T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) + + RISKFREE * X_val * expRT * + CND(d2_val); // theta is with respect to t, so -theta is the + // approximation of the derivative with respect + // to T + } + } else if constexpr (opt == Put) { + if constexpr (greek == Delta) { + double d1_val = d1(S[i], X[i], T[i]); + ref = CND(d1_val) - 1.0; + } else if constexpr (greek == dX) { + double T_val = T[i]; + double d1_val = d1(S[i], X[i], T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = expRT * CND(-d2_val); + } else if constexpr (greek == Theta) { + double S_val = S[i], X_val = X[i], T_val = T[i]; + double d1_val = d1(S_val, X_val, T_val); + double d2_val = d1_val - VOLATILITY * sqrt(T_val); + double expRT = exp(-RISKFREE * T_val); + ref = (S_val * N_prime(d1_val) * VOLATILITY) / (2 * sqrt(T_val)) - + RISKFREE * X_val * expRT * CND(-d2_val); + } + } + + delta = fabs(d[i] - ref); + sum_delta += delta; + sum_ref += fabs(ref); + } + + double L1norm = sum_delta / sum_ref; + printf("L1norm of %s for %s option = %E\n", getNameOfGreek(), + getNameofOpt(), L1norm); + if (L1norm > 1e-5) { + printf( + "Gradient test failed: Difference between %s's computed and " + "approximated theoretical values for %s option is larger than expected", + getNameOfGreek(), getNameofOpt()); + exit(EXIT_FAILURE); + } +} From 3349476a19ab0bfd96ff975549fb40dbfa5985ba Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 20 Nov 2024 19:39:55 +0200 Subject: [PATCH 11/18] Fix format --- demos/CUDA/BlackScholes/helper/helper_grad_verify.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/CUDA/BlackScholes/helper/helper_grad_verify.h b/demos/CUDA/BlackScholes/helper/helper_grad_verify.h index 804079ce9..1b4ad8da1 100644 --- a/demos/CUDA/BlackScholes/helper/helper_grad_verify.h +++ b/demos/CUDA/BlackScholes/helper/helper_grad_verify.h @@ -12,7 +12,7 @@ const int OPT_SZ = OPT_N * sizeof(float); const float RISKFREE = 0.02f; const float VOLATILITY = 0.30f; -#define DIV_UP(a, b) (((a) + (b)-1) / (b)) +#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) double d1(double S, double X, double T) { return (log(S / X) + (RISKFREE + 0.5 * VOLATILITY * VOLATILITY) * T) / From d7521c3dbce4e96f7d705145e405ae257698c243 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 20 Nov 2024 19:43:06 +0200 Subject: [PATCH 12/18] Fix typo --- demos/CUDA/BlackScholes/BlackScholes.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index b5187be71..c9aa6df6c 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -35,8 +35,8 @@ * DISCLAIMER: The following file has been slightly modified to ensure * compatibility with Clad and to serve as a Clad demo. Specifically, parts of * the original `main` function have been moved to a separate function to use - * `clad::gradient` on. Furthermore, Clad cannot clone printf statements so some - * original print statements have been ommitted. The same applies to the + * `clad::gradient` on. Furthermore, Clad cannot clone printf statements, so some + * original print statements have been omitted. The same applies to the * checkCudaErrors function. * New helper functions are included in another file and invoked here to verify * the gradient's results. The original file is available in NVIDIA's From 7fcd1398681ec7b53c3843404bdf6488093370d0 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 20 Nov 2024 20:19:15 +0200 Subject: [PATCH 13/18] Fix format --- demos/CUDA/BlackScholes/BlackScholes.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index c9aa6df6c..6e9ca4f0d 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -35,8 +35,8 @@ * DISCLAIMER: The following file has been slightly modified to ensure * compatibility with Clad and to serve as a Clad demo. Specifically, parts of * the original `main` function have been moved to a separate function to use - * `clad::gradient` on. Furthermore, Clad cannot clone printf statements, so some - * original print statements have been omitted. The same applies to the + * `clad::gradient` on. Furthermore, Clad cannot clone printf statements, so + * some original print statements have been omitted. The same applies to the * checkCudaErrors function. * New helper functions are included in another file and invoked here to verify * the gradient's results. The original file is available in NVIDIA's From 2ca1f5f0b22282c71afaf97e4b7174be34568bf0 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 20 Nov 2024 23:42:14 +0200 Subject: [PATCH 14/18] Add missing print statements in function to hand to Clad --- demos/CUDA/BlackScholes/BlackScholes.cu | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 6e9ca4f0d..833401e1b 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -35,12 +35,11 @@ * DISCLAIMER: The following file has been slightly modified to ensure * compatibility with Clad and to serve as a Clad demo. Specifically, parts of * the original `main` function have been moved to a separate function to use - * `clad::gradient` on. Furthermore, Clad cannot clone printf statements, so - * some original print statements have been omitted. The same applies to the - * checkCudaErrors function. - * New helper functions are included in another file and invoked here to verify - * the gradient's results. The original file is available in NVIDIA's - * cuda-samples repository on GitHub. + * `clad::gradient` on. Furthermore, Clad cannot clone checkCudaErrors + * successfully, so these calls have been omitted. The same applies to the + * cudaDeviceSynchronize function. New helper functions are included in another + * file and invoked here to verify the gradient's results. The original file is + * available in NVIDIA's cuda-samples repository on GitHub. * * Relevant documentation regarding the problem at hand can be found in NVIDIA's * cuda-samples repository. Using Clad, we compute some of the Greeks @@ -110,6 +109,7 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, *d_StockPrice = nullptr, *d_OptionStrike = nullptr, *d_OptionYears = nullptr; + printf("...allocating GPU memory for options.\n"); cudaMalloc((void**)&d_CallResult, OPT_SZ); cudaMalloc((void**)&d_PutResult, OPT_SZ); cudaMalloc((void**)&d_StockPrice, OPT_SZ); @@ -117,9 +117,14 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, cudaMalloc((void**)&d_OptionYears, OPT_SZ); // Copy options data to GPU memory for further processing + printf("...copying input data to GPU mem.\n"); cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice); cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice); cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice); + printf("Data init done.\n\n"); + + printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", + NUM_ITERATIONS); BlackScholesGPU<<>>( (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice, @@ -128,14 +133,19 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, // Both call and put is calculated + printf("\nReading back GPU results...\n"); // Read back GPU results to compare them to CPU results cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost); cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost); + // Calculate options values on CPU + printf("Checking the results...\n"); + printf("...running CPU calculations.\n\n"); // Calculate options values on CPU BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, h_OptionYears, RISKFREE, VOLATILITY, OPT_N); + printf("...releasing GPU memory.\n"); cudaFree(d_OptionYears); cudaFree(d_OptionStrike); cudaFree(d_StockPrice); From 11183f497f39084a6d97edaa772e16ddbf408b8f Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 22 Nov 2024 18:53:36 +0200 Subject: [PATCH 15/18] Add for loop for mutliple iters of GPU kernel --- demos/CUDA/BlackScholes/BlackScholes.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 833401e1b..3ac8dce8f 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -126,10 +126,12 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS); - BlackScholesGPU<<>>( - (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice, - (float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY, - OPT_N); + for (int i = 0; i < NUM_ITERATIONS; i++) { + BlackScholesGPU<<>>( + (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice, + (float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY, + OPT_N); + } // Both call and put is calculated From fa41bff61f49a027007941e206cd02c355180e44 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 22 Nov 2024 20:19:22 +0200 Subject: [PATCH 16/18] Keep changes close together and move CPU function call to main --- demos/CUDA/BlackScholes/BlackScholes.cu | 175 +++++++----------- .../BlackScholes/helper/helper_grad_verify.h | 12 -- 2 files changed, 62 insertions(+), 125 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 3ac8dce8f..28949797b 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -31,39 +31,16 @@ * See supplied whitepaper for more explanations. */ -/* - * DISCLAIMER: The following file has been slightly modified to ensure - * compatibility with Clad and to serve as a Clad demo. Specifically, parts of - * the original `main` function have been moved to a separate function to use - * `clad::gradient` on. Furthermore, Clad cannot clone checkCudaErrors - * successfully, so these calls have been omitted. The same applies to the - * cudaDeviceSynchronize function. New helper functions are included in another - * file and invoked here to verify the gradient's results. The original file is - * available in NVIDIA's cuda-samples repository on GitHub. - * - * Relevant documentation regarding the problem at hand can be found in NVIDIA's - * cuda-samples repository. Using Clad, we compute some of the Greeks - * (sensitivities) for the Black-Scholes model and verify them against - * approximations of their theoretical values as denoted in Wikipedia - * (https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model). - * - * To build and run the demo, use the following command: make run - */ - -#include "clad/Differentiator/Differentiator.h" - -#include // helper functions CUDA error checking and initialization -#include // helper functions for string parsing -#include +#include // helper functions for string parsing +#include // helper functions CUDA error checking and initialization //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on CPU //////////////////////////////////////////////////////////////////////////////// -extern "C" void BlackScholesCPU(float* h_CallResult, float* h_PutResult, - float* h_StockPrice, float* h_OptionStrike, - float* h_OptionYears, float Riskfree, +extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, + float *h_StockPrice, float *h_OptionStrike, + float *h_OptionYears, float Riskfree, float Volatility, int optN); -extern "C" double CND(double d); //////////////////////////////////////////////////////////////////////////////// // Process an array of OptN options on GPU @@ -79,23 +56,44 @@ float RandFloat(float low, float high) { return (1.0f - t) * low + t * high; } -// This section is included in the helper_grad_verify.h file //////////////////////////////////////////////////////////////////////////////// // Data configuration //////////////////////////////////////////////////////////////////////////////// -// const int OPT_N = 4000000; -// const int NUM_ITERATIONS = 512; +const int OPT_N = 4000000; +const int NUM_ITERATIONS = 512; -// const int OPT_SZ = OPT_N * sizeof(float); -// const float RISKFREE = 0.02f; -// const float VOLATILITY = 0.30f; +const int OPT_SZ = OPT_N * sizeof(float); +const float RISKFREE = 0.02f; +const float VOLATILITY = 0.30f; -// #define DIV_UP(a, b) (((a) + (b) - 1) / (b)) +#define DIV_UP(a, b) (((a) + (b)-1) / (b)) //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// +/* + * DISCLAIMER: The following file has been slightly modified to ensure + * compatibility with Clad and to serve as a Clad demo. Specifically, parts of + * the original `main` function have been moved to a separate function to use + * `clad::gradient` on. Furthermore, Clad cannot clone checkCudaErrors + * successfully, so these calls have been omitted. The same applies to the + * cudaDeviceSynchronize function. New helper functions are included in another + * file and invoked here to verify the gradient's results. The original file is + * available in NVIDIA's cuda-samples repository on GitHub. + * + * Relevant documentation regarding the problem at hand can be found in NVIDIA's + * cuda-samples repository. Using Clad, we compute some of the Greeks + * (sensitivities) for the Black-Scholes model and verify them against + * approximations of their theoretical values as denoted in Wikipedia + * (https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model). + * + * To build and run the demo, use the following command: make run + */ + +#include "clad/Differentiator/Differentiator.h" +#include + void launch(float* h_CallResultCPU, float* h_CallResultGPU, float* h_PutResultCPU, float* h_PutResultGPU, float* h_StockPrice, float* h_OptionStrike, float* h_OptionYears) { @@ -106,7 +104,7 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, *d_CallResult = nullptr, *d_PutResult = nullptr, // GPU instance of input data - *d_StockPrice = nullptr, *d_OptionStrike = nullptr, + *d_StockPrice = nullptr, *d_OptionStrike = nullptr, *d_OptionYears = nullptr; printf("...allocating GPU memory for options.\n"); @@ -125,11 +123,11 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS); - - for (int i = 0; i < NUM_ITERATIONS; i++) { + int i; + for (i = 0; i < NUM_ITERATIONS; i++) { BlackScholesGPU<<>>( - (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice, - (float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY, + (float2 *)d_CallResult, (float2 *)d_PutResult, (float2 *)d_StockPrice, + (float2 *)d_OptionStrike, (float2 *)d_OptionYears, RISKFREE, VOLATILITY, OPT_N); } @@ -140,13 +138,6 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost); cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost); - // Calculate options values on CPU - printf("Checking the results...\n"); - printf("...running CPU calculations.\n\n"); - // Calculate options values on CPU - BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, - h_OptionYears, RISKFREE, VOLATILITY, OPT_N); - printf("...releasing GPU memory.\n"); cudaFree(d_OptionYears); cudaFree(d_OptionStrike); @@ -171,22 +162,22 @@ int main(int argc, char** argv) { double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; - StopWatchInterface* hTimer = NULL; + StopWatchInterface *hTimer = NULL; int i; - findCudaDevice(argc, (const char**)argv); + findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); printf("Initializing data...\n"); printf("...allocating CPU memory for options.\n"); - h_CallResultCPU = (float*)malloc(OPT_SZ); - h_PutResultCPU = (float*)malloc(OPT_SZ); - h_CallResultGPU = (float*)malloc(OPT_SZ); - h_PutResultGPU = (float*)malloc(OPT_SZ); - h_StockPrice = (float*)malloc(OPT_SZ); - h_OptionStrike = (float*)malloc(OPT_SZ); - h_OptionYears = (float*)malloc(OPT_SZ); + h_CallResultCPU = (float *)malloc(OPT_SZ); + h_PutResultCPU = (float *)malloc(OPT_SZ); + h_CallResultGPU = (float *)malloc(OPT_SZ); + h_PutResultGPU = (float *)malloc(OPT_SZ); + h_StockPrice = (float *)malloc(OPT_SZ); + h_OptionStrike = (float *)malloc(OPT_SZ); + h_OptionYears = (float *)malloc(OPT_SZ); printf("...generating input data in CPU mem.\n"); srand(5347); @@ -243,14 +234,18 @@ int main(int argc, char** argv) { printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); - printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " - "options, NumDevsUsed = %u, Workgroup = %u\n", - (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), - gpuTime * 1e-3, (2 * OPT_N), 1, 128); + printf( + "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " + "options, NumDevsUsed = %u, Workgroup = %u\n", + (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3, + (2 * OPT_N), 1, 128); printf("Checking the results...\n"); - // Calculate max absolute difference and L1 distance - // between CPU and GPU results + printf("...running CPU calculations.\n\n"); + // Calculate options values on CPU + BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, + h_OptionYears, RISKFREE, VOLATILITY, OPT_N); + printf("Comparing the results...\n"); // Calculate max absolute difference and L1 distance // between CPU and GPU results @@ -262,8 +257,9 @@ int main(int argc, char** argv) { ref = h_CallResultCPU[i]; delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); - if (delta > max_delta) + if (delta > max_delta) { max_delta = delta; + } sum_delta += delta; sum_ref += fabs(ref); @@ -273,48 +269,6 @@ int main(int argc, char** argv) { printf("L1 norm: %E\n", L1norm); printf("Max absolute error: %E\n\n", max_delta); - // Verify delta - computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, - d_StockPrice); - // Verify derivatives with respect to the Strike price - computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionStrike); - // Verify theta - computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionYears); - - /*******************************************************************************/ - - // Re-initialize data for next gradient call - for (int i = 0; i < OPT_N; i++) { - h_CallResultCPU[i] = 0.0f; - h_PutResultCPU[i] = -1.0f; - d_CallResultGPU[i] = 1.0f; - d_PutResultGPU[i] = 1.0f; - } - - for (int i = 0; i < OPT_N; i++) { - d_StockPrice[i] = 0.f; - d_OptionStrike[i] = 0.f; - d_OptionYears[i] = 0.f; - } - - // Compute the values and derivatives of the price of the Put options - putGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU, - h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears, - d_PutResultGPU, d_StockPrice, d_OptionStrike, d_OptionYears); - - // Verify delta - computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, - d_StockPrice); - // Verify derivatives with respect to the Strike price - computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionStrike); - // Verify theta - computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionYears); - - /*******************************************************************************/ printf("Shutting down...\n"); printf("...releasing CPU memory.\n"); free(h_OptionYears); @@ -324,13 +278,7 @@ int main(int argc, char** argv) { free(h_CallResultGPU); free(h_PutResultCPU); free(h_CallResultCPU); - free(d_OptionYears); - free(d_OptionStrike); - free(d_StockPrice); - free(d_PutResultGPU); - free(d_CallResultGPU); sdkDeleteTimer(&hTimer); - printf("Shutdown done.\n"); printf("\n[BlackScholes] - Test Summary\n"); @@ -340,8 +288,9 @@ int main(int argc, char** argv) { exit(EXIT_FAILURE); } - printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); + printf( + "\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n\n"); printf("Test passed\n"); exit(EXIT_SUCCESS); } \ No newline at end of file diff --git a/demos/CUDA/BlackScholes/helper/helper_grad_verify.h b/demos/CUDA/BlackScholes/helper/helper_grad_verify.h index 1b4ad8da1..efa4bdf65 100644 --- a/demos/CUDA/BlackScholes/helper/helper_grad_verify.h +++ b/demos/CUDA/BlackScholes/helper/helper_grad_verify.h @@ -2,18 +2,6 @@ extern "C" double CND(double d); -//////////////////////////////////////////////////////////////////////////////// -// Data configuration -//////////////////////////////////////////////////////////////////////////////// -const int OPT_N = 4000000; -const int NUM_ITERATIONS = 512; - -const int OPT_SZ = OPT_N * sizeof(float); -const float RISKFREE = 0.02f; -const float VOLATILITY = 0.30f; - -#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) - double d1(double S, double X, double T) { return (log(S / X) + (RISKFREE + 0.5 * VOLATILITY * VOLATILITY) * T) / (VOLATILITY * sqrt(T)); From c47d6856910a4b2c54031983a80b76a7023e1926 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 22 Nov 2024 20:27:25 +0200 Subject: [PATCH 17/18] Update disclaimer with timer note --- demos/CUDA/BlackScholes/BlackScholes.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 28949797b..e5ec574bf 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -79,8 +79,12 @@ const float VOLATILITY = 0.30f; * `clad::gradient` on. Furthermore, Clad cannot clone checkCudaErrors * successfully, so these calls have been omitted. The same applies to the * cudaDeviceSynchronize function. New helper functions are included in another - * file and invoked here to verify the gradient's results. The original file is - * available in NVIDIA's cuda-samples repository on GitHub. + * file and invoked here to verify the gradient's results. Since Clad cannot + * handle timers at the moment, the time measurement is included in + * `main` and doesn't time exclusively the original kernel execution, but the + * whole `launch` function and its gradient are timed in this version. + * + * The original file is available in NVIDIA's cuda-samples repository on GitHub. * * Relevant documentation regarding the problem at hand can be found in NVIDIA's * cuda-samples repository. Using Clad, we compute some of the Greeks From b8d7c1cbde01e57ec793f3c5bbe54fa433de6108 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 22 Nov 2024 20:33:46 +0200 Subject: [PATCH 18/18] Add put option execution and deallocation of derivatives again --- demos/CUDA/BlackScholes/BlackScholes.cu | 46 ++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index e5ec574bf..5071b5c2d 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -150,7 +150,7 @@ void launch(float* h_CallResultCPU, float* h_CallResultGPU, cudaFree(d_CallResult); } -int main(int argc, char** argv) { +int main(int argc, char **argv) { // Start logs printf("[%s] - Starting...\n", argv[0]); @@ -273,6 +273,45 @@ int main(int argc, char** argv) { printf("L1 norm: %E\n", L1norm); printf("Max absolute error: %E\n\n", max_delta); + // Verify delta + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_StockPrice); + // Verify derivatives with respect to the Strike price + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionStrike); + // Verify theta + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionYears); + /*******************************************************************************/ + // Re-initialize data for next gradient call + for (int i = 0; i < OPT_N; i++) + { + h_CallResultCPU[i] = 0.0f; + h_PutResultCPU[i] = -1.0f; + d_CallResultGPU[i] = 1.0f; + d_PutResultGPU[i] = 1.0f; + } + for (int i = 0; i < OPT_N; i++) + { + d_StockPrice[i] = 0.f; + d_OptionStrike[i] = 0.f; + d_OptionYears[i] = 0.f; + } + // Compute the values and derivatives of the price of the Put options + putGrad.execute(h_CallResultCPU, h_CallResultGPU, h_PutResultCPU, + h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears, + d_PutResultGPU, d_StockPrice, d_OptionStrike, d_OptionYears); + // Verify delta + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_StockPrice); + // Verify derivatives with respect to the Strike price + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionStrike); + // Verify theta + computeL1norm(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionYears); + /*******************************************************************************/ + printf("Shutting down...\n"); printf("...releasing CPU memory.\n"); free(h_OptionYears); @@ -282,6 +321,11 @@ int main(int argc, char** argv) { free(h_CallResultGPU); free(h_PutResultCPU); free(h_CallResultCPU); + free(d_OptionYears); + free(d_OptionStrike); + free(d_StockPrice); + free(d_PutResultGPU); + free(d_CallResultGPU); sdkDeleteTimer(&hTimer); printf("Shutdown done.\n");