diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 0f4ecd44c..73dfe4f2b 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -31,6 +31,8 @@ * See supplied whitepaper for more explanations. */ +#include "clad/Differentiator/Differentiator.h" + #include // helper functions for string parsing #include // helper functions CUDA error checking and initialization @@ -71,59 +73,66 @@ const float VOLATILITY = 0.30f; //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // Start logs - printf("[%s] - Starting...\n", argv[0]); - //'h_' prefix - CPU (host) memory space - float - // Results calculated by CPU for reference - *h_CallResultCPU, - *h_PutResultCPU, - // CPU copy of GPU results - *h_CallResultGPU, *h_PutResultGPU, - // CPU instance of input data - *h_StockPrice, *h_OptionStrike, *h_OptionYears; +void launch(float* h_CallResultCPU, float* h_CallResultGPU, + float* h_PutResultCPU, float* h_PutResultGPU, float* h_StockPrice, + float* h_OptionStrike, float* h_OptionYears) { //'d_' prefix - GPU (device) memory space float // Results calculated by GPU - *d_CallResult, - *d_PutResult, + *d_CallResult = nullptr, + *d_PutResult = nullptr, // GPU instance of input data - *d_StockPrice, *d_OptionStrike, *d_OptionYears; + *d_StockPrice = nullptr, *d_OptionStrike = nullptr, + *d_OptionYears = nullptr; + + cudaMalloc((void**)&d_CallResult, OPT_SZ); + cudaMalloc((void**)&d_PutResult, OPT_SZ); + cudaMalloc((void**)&d_StockPrice, OPT_SZ); + cudaMalloc((void**)&d_OptionStrike, OPT_SZ); + cudaMalloc((void**)&d_OptionYears, OPT_SZ); - double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; + // Copy options data to GPU memory for further processing + cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice); + cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice); + cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice); + + BlackScholesGPU<<>>( + (float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice, + (float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY, + OPT_N); - StopWatchInterface *hTimer = NULL; - int i; + // Both call and put is calculated - findCudaDevice(argc, (const char **)argv); + // Read back GPU results to compare them to CPU results + cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost); + cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost); - sdkCreateTimer(&hTimer); + // Calculate options values on CPU + BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, + h_OptionYears, RISKFREE, VOLATILITY, OPT_N); - printf("Initializing data...\n"); - printf("...allocating CPU memory for options.\n"); - h_CallResultCPU = (float *)malloc(OPT_SZ); - h_PutResultCPU = (float *)malloc(OPT_SZ); - h_CallResultGPU = (float *)malloc(OPT_SZ); - h_PutResultGPU = (float *)malloc(OPT_SZ); - h_StockPrice = (float *)malloc(OPT_SZ); - h_OptionStrike = (float *)malloc(OPT_SZ); - h_OptionYears = (float *)malloc(OPT_SZ); + cudaFree(d_OptionYears); + cudaFree(d_OptionStrike); + cudaFree(d_StockPrice); + cudaFree(d_PutResult); + cudaFree(d_CallResult); +} - printf("...allocating GPU memory for options.\n"); - checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ)); +int main(int argc, char** argv) { + float* h_CallResultCPU = (float*)malloc(OPT_SZ); + float* h_PutResultCPU = (float*)malloc(OPT_SZ); + float* h_CallResultGPU = (float*)malloc(OPT_SZ); + float* h_PutResultGPU = (float*)malloc(OPT_SZ); + float* h_StockPrice = (float*)malloc(OPT_SZ); + float* h_OptionStrike = (float*)malloc(OPT_SZ); + float* h_OptionYears = (float*)malloc(OPT_SZ); - printf("...generating input data in CPU mem.\n"); srand(5347); // Generate options set - for (i = 0; i < OPT_N; i++) { + for (int i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; h_PutResultCPU[i] = -1.0f; h_StockPrice[i] = RandFloat(5.0f, 30.0f); @@ -131,93 +140,31 @@ int main(int argc, char **argv) { h_OptionYears[i] = RandFloat(0.25f, 10.0f); } - printf("...copying input data to GPU mem.\n"); - // Copy options data to GPU memory for further processing - checkCudaErrors( - cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, - cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice)); - printf("Data init done.\n\n"); - - printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", - NUM_ITERATIONS); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - - for (i = 0; i < NUM_ITERATIONS; i++) { - BlackScholesGPU<<>>( - (float2 *)d_CallResult, (float2 *)d_PutResult, (float2 *)d_StockPrice, - (float2 *)d_OptionStrike, (float2 *)d_OptionYears, RISKFREE, VOLATILITY, - OPT_N); - getLastCudaError("BlackScholesGPU() execution failed\n"); - } - - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; - - // Both call and put is calculated - printf("Options count : %i \n", 2 * OPT_N); - printf("BlackScholesGPU() time : %f msec\n", gpuTime); - printf("Effective memory bandwidth: %f GB/s\n", - ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); - printf("Gigaoptions per second : %f \n\n", - ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); - - printf( - "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " - "options, NumDevsUsed = %u, Workgroup = %u\n", - (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3, - (2 * OPT_N), 1, 128); - - printf("\nReading back GPU results...\n"); - // Read back GPU results to compare them to CPU results - checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, - cudaMemcpyDeviceToHost)); - checkCudaErrors( - cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost)); + auto callGrad = clad::gradient( + launch, "h_CallResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears"); + auto putGrad = clad::gradient( + launch, "h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears"); - printf("Checking the results...\n"); - printf("...running CPU calculations.\n\n"); - // Calculate options values on CPU - BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, - h_OptionYears, RISKFREE, VOLATILITY, OPT_N); - - printf("Comparing the results...\n"); // Calculate max absolute difference and L1 distance // between CPU and GPU results + float delta, ref, sum_delta, sum_ref, max_delta, L1norm; sum_delta = 0; sum_ref = 0; max_delta = 0; - for (i = 0; i < OPT_N; i++) { + for (int i = 0; i < OPT_N; i++) { ref = h_CallResultCPU[i]; - delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); + delta = fabsf(h_CallResultCPU[i] - h_CallResultGPU[i]); - if (delta > max_delta) { + if (delta > max_delta) max_delta = delta; - } sum_delta += delta; - sum_ref += fabs(ref); + sum_ref += fabsf(ref); } L1norm = sum_delta / sum_ref; - printf("L1 norm: %E\n", L1norm); - printf("Max absolute error: %E\n\n", max_delta); - - printf("Shutting down...\n"); - printf("...releasing GPU memory.\n"); - checkCudaErrors(cudaFree(d_OptionYears)); - checkCudaErrors(cudaFree(d_OptionStrike)); - checkCudaErrors(cudaFree(d_StockPrice)); - checkCudaErrors(cudaFree(d_PutResult)); - checkCudaErrors(cudaFree(d_CallResult)); - - printf("...releasing CPU memory.\n"); + free(h_OptionYears); free(h_OptionStrike); free(h_StockPrice); @@ -225,19 +172,11 @@ int main(int argc, char **argv) { free(h_CallResultGPU); free(h_PutResultCPU); free(h_CallResultCPU); - sdkDeleteTimer(&hTimer); - printf("Shutdown done.\n"); - - printf("\n[BlackScholes] - Test Summary\n"); if (L1norm > 1e-6) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); + printf("Original test failed\n"); + return EXIT_FAILURE; } - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); - printf("Test passed\n"); - exit(EXIT_SUCCESS); + return EXIT_SUCCESS; } diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh index 9c49d49e4..dfcf5c575 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -36,9 +36,9 @@ __device__ inline float cndGPU(float d) { const float A5 = 1.330274429f; const float RSQRT2PI = 0.39894228040143267793994605993438f; - float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); + float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); - float cnd = RSQRT2PI * __expf(-0.5f * d * d) * + float cnd = RSQRT2PI * expf(-0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); if (d > 0) cnd = 1.0f - cnd; @@ -59,15 +59,15 @@ __device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult, float sqrtT, expRT; float d1, d2, CNDD1, CNDD2; - sqrtT = __fdividef(1.0F, rsqrtf(T)); - d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); + sqrtT = fdividef(1.0F, 1.0 / sqrtf(T)); + d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); d2 = d1 - V * sqrtT; CNDD1 = cndGPU(d1); CNDD2 = cndGPU(d2); // Calculate Call and Put simultaneously - expRT = __expf(-R * T); + expRT = expf(-R * T); CallResult = S * CNDD1 - X * expRT * CNDD2; PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); } @@ -75,13 +75,12 @@ __device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult, //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on GPU //////////////////////////////////////////////////////////////////////////////// -__launch_bounds__(128) __global__ - void BlackScholesGPU(float2 *__restrict d_CallResult, - float2 *__restrict d_PutResult, - float2 *__restrict d_StockPrice, - float2 *__restrict d_OptionStrike, - float2 *__restrict d_OptionYears, float Riskfree, - float Volatility, int optN) { +__global__ void BlackScholesGPU(float2* __restrict d_CallResult, + float2* __restrict d_PutResult, + float2* __restrict d_StockPrice, + float2* __restrict d_OptionStrike, + float2* __restrict d_OptionYears, + float Riskfree, float Volatility, int optN) { ////Thread index // const int tid = blockDim.x * blockIdx.x + threadIdx.x; ////Total number of threads in execution grid @@ -100,7 +99,9 @@ __launch_bounds__(128) __global__ BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y, d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree, Volatility); - d_CallResult[opt] = make_float2(callResult1, callResult2); - d_PutResult[opt] = make_float2(putResult1, putResult2); + d_CallResult[opt].x = callResult1; + d_CallResult[opt].y = callResult2; + d_PutResult[opt].x = putResult1; + d_PutResult[opt].y = putResult2; } } diff --git a/demos/CUDA/TensorContraction.cu b/demos/CUDA/TensorContraction.cu index a1adc48bc..8e1efaf9b 100644 --- a/demos/CUDA/TensorContraction.cu +++ b/demos/CUDA/TensorContraction.cu @@ -13,7 +13,6 @@ // -L/path/to/cuda/lib64 -lcudart_static -ldl -lrt -pthread -lm -lstdc++ // RUN: ./TensorContraction -#include "cuda_runtime_api.h" #include "clad/Differentiator/Differentiator.h" typedef unsigned long long int size_type; diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 62296ab92..8b8b4e09a 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -386,6 +386,28 @@ inline void free_pushforward(void* ptr, void* d_ptr) { // NOLINTEND(cppcoreguidelines-owning-memory) // NOLINTEND(cppcoreguidelines-no-malloc) +CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) { + *d_a += expf(a) * d_y; +} + +CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) { + *d_a += (a >= 0) ? d_y : -d_y; +} + +CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) { + *d_a += (1.F / a) * d_y; +} + +CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y, + float* d_a, float* d_b) { + *d_a += (1.F / b) * d_y; + *d_b += (-a / (b * b)) * d_y; +} + +CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) { + *d_a += (1.F / (2.F * sqrtf(a))) * d_y; +} + // These are required because C variants of mathematical functions are // defined in global namespace. using std::abs_pushforward;