Skip to content

Commit

Permalink
Use gradient on BlackScholes demo
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 committed Nov 13, 2024
1 parent c3ccc8f commit cab90ed
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 135 deletions.
179 changes: 59 additions & 120 deletions demos/CUDA/BlackScholes/BlackScholes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
* See supplied whitepaper for more explanations.
*/

#include "clad/Differentiator/Differentiator.h"

#include <helper_functions.h> // helper functions for string parsing
#include <helper_cuda.h> // helper functions CUDA error checking and initialization

Expand Down Expand Up @@ -71,173 +73,110 @@ const float VOLATILITY = 0.30f;
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
// Start logs
printf("[%s] - Starting...\n", argv[0]);

//'h_' prefix - CPU (host) memory space
float
// Results calculated by CPU for reference
*h_CallResultCPU,
*h_PutResultCPU,
// CPU copy of GPU results
*h_CallResultGPU, *h_PutResultGPU,
// CPU instance of input data
*h_StockPrice, *h_OptionStrike, *h_OptionYears;
void launch(float* h_CallResultCPU, float* h_CallResultGPU,
float* h_PutResultCPU, float* h_PutResultGPU, float* h_StockPrice,
float* h_OptionStrike, float* h_OptionYears) {

//'d_' prefix - GPU (device) memory space
float
// Results calculated by GPU
*d_CallResult,
*d_PutResult,
*d_CallResult = nullptr,
*d_PutResult = nullptr,
// GPU instance of input data
*d_StockPrice, *d_OptionStrike, *d_OptionYears;
*d_StockPrice = nullptr, *d_OptionStrike = nullptr,
*d_OptionYears = nullptr;

cudaMalloc((void**)&d_CallResult, OPT_SZ);
cudaMalloc((void**)&d_PutResult, OPT_SZ);
cudaMalloc((void**)&d_StockPrice, OPT_SZ);
cudaMalloc((void**)&d_OptionStrike, OPT_SZ);
cudaMalloc((void**)&d_OptionYears, OPT_SZ);

double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;
// Copy options data to GPU memory for further processing
cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice);
cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice);
cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice);

BlackScholesGPU<<<DIV_UP((OPT_N / 2), 128), 128 /*480, 128*/>>>(
(float2*)d_CallResult, (float2*)d_PutResult, (float2*)d_StockPrice,
(float2*)d_OptionStrike, (float2*)d_OptionYears, RISKFREE, VOLATILITY,
OPT_N);

StopWatchInterface *hTimer = NULL;
int i;
// Both call and put is calculated

findCudaDevice(argc, (const char **)argv);
// Read back GPU results to compare them to CPU results
cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost);
cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost);

sdkCreateTimer(&hTimer);
// Calculate options values on CPU
BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
h_OptionYears, RISKFREE, VOLATILITY, OPT_N);

printf("Initializing data...\n");
printf("...allocating CPU memory for options.\n");
h_CallResultCPU = (float *)malloc(OPT_SZ);
h_PutResultCPU = (float *)malloc(OPT_SZ);
h_CallResultGPU = (float *)malloc(OPT_SZ);
h_PutResultGPU = (float *)malloc(OPT_SZ);
h_StockPrice = (float *)malloc(OPT_SZ);
h_OptionStrike = (float *)malloc(OPT_SZ);
h_OptionYears = (float *)malloc(OPT_SZ);
cudaFree(d_OptionYears);
cudaFree(d_OptionStrike);
cudaFree(d_StockPrice);
cudaFree(d_PutResult);
cudaFree(d_CallResult);
}

printf("...allocating GPU memory for options.\n");
checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ));
int main(int argc, char** argv) {
float* h_CallResultCPU = (float*)malloc(OPT_SZ);
float* h_PutResultCPU = (float*)malloc(OPT_SZ);
float* h_CallResultGPU = (float*)malloc(OPT_SZ);
float* h_PutResultGPU = (float*)malloc(OPT_SZ);
float* h_StockPrice = (float*)malloc(OPT_SZ);
float* h_OptionStrike = (float*)malloc(OPT_SZ);
float* h_OptionYears = (float*)malloc(OPT_SZ);

printf("...generating input data in CPU mem.\n");
srand(5347);

// Generate options set
for (i = 0; i < OPT_N; i++) {
for (int i = 0; i < OPT_N; i++) {
h_CallResultCPU[i] = 0.0f;
h_PutResultCPU[i] = -1.0f;
h_StockPrice[i] = RandFloat(5.0f, 30.0f);
h_OptionStrike[i] = RandFloat(1.0f, 100.0f);
h_OptionYears[i] = RandFloat(0.25f, 10.0f);
}

printf("...copying input data to GPU mem.\n");
// Copy options data to GPU memory for further processing
checkCudaErrors(
cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ,
cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice));
printf("Data init done.\n\n");

printf("Executing Black-Scholes GPU kernel (%i iterations)...\n",
NUM_ITERATIONS);
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);

for (i = 0; i < NUM_ITERATIONS; i++) {
BlackScholesGPU<<<DIV_UP((OPT_N / 2), 128), 128 /*480, 128*/>>>(
(float2 *)d_CallResult, (float2 *)d_PutResult, (float2 *)d_StockPrice,
(float2 *)d_OptionStrike, (float2 *)d_OptionYears, RISKFREE, VOLATILITY,
OPT_N);
getLastCudaError("BlackScholesGPU() execution failed\n");
}

checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;

// Both call and put is calculated
printf("Options count : %i \n", 2 * OPT_N);
printf("BlackScholesGPU() time : %f msec\n", gpuTime);
printf("Effective memory bandwidth: %f GB/s\n",
((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
printf("Gigaoptions per second : %f \n\n",
((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));

printf(
"BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u "
"options, NumDevsUsed = %u, Workgroup = %u\n",
(((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3,
(2 * OPT_N), 1, 128);

printf("\nReading back GPU results...\n");
// Read back GPU results to compare them to CPU results
checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ,
cudaMemcpyDeviceToHost));
checkCudaErrors(
cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost));
auto callGrad = clad::gradient(
launch, "h_CallResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears");
auto putGrad = clad::gradient(
launch, "h_PutResultGPU, h_StockPrice, h_OptionStrike, h_OptionYears");

printf("Checking the results...\n");
printf("...running CPU calculations.\n\n");
// Calculate options values on CPU
BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
h_OptionYears, RISKFREE, VOLATILITY, OPT_N);

printf("Comparing the results...\n");
// Calculate max absolute difference and L1 distance
// between CPU and GPU results
float delta, ref, sum_delta, sum_ref, max_delta, L1norm;
sum_delta = 0;
sum_ref = 0;
max_delta = 0;

for (i = 0; i < OPT_N; i++) {
for (int i = 0; i < OPT_N; i++) {
ref = h_CallResultCPU[i];
delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);
delta = fabsf(h_CallResultCPU[i] - h_CallResultGPU[i]);

if (delta > max_delta) {
if (delta > max_delta)
max_delta = delta;
}

sum_delta += delta;
sum_ref += fabs(ref);
sum_ref += fabsf(ref);
}

L1norm = sum_delta / sum_ref;
printf("L1 norm: %E\n", L1norm);
printf("Max absolute error: %E\n\n", max_delta);

printf("Shutting down...\n");
printf("...releasing GPU memory.\n");
checkCudaErrors(cudaFree(d_OptionYears));
checkCudaErrors(cudaFree(d_OptionStrike));
checkCudaErrors(cudaFree(d_StockPrice));
checkCudaErrors(cudaFree(d_PutResult));
checkCudaErrors(cudaFree(d_CallResult));

printf("...releasing CPU memory.\n");

free(h_OptionYears);
free(h_OptionStrike);
free(h_StockPrice);
free(h_PutResultGPU);
free(h_CallResultGPU);
free(h_PutResultCPU);
free(h_CallResultCPU);
sdkDeleteTimer(&hTimer);
printf("Shutdown done.\n");

printf("\n[BlackScholes] - Test Summary\n");

if (L1norm > 1e-6) {
printf("Test failed!\n");
exit(EXIT_FAILURE);
printf("Original test failed\n");
return EXIT_FAILURE;
}

printf(
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n\n");
printf("Test passed\n");
exit(EXIT_SUCCESS);
return EXIT_SUCCESS;
}
29 changes: 15 additions & 14 deletions demos/CUDA/BlackScholes/BlackScholes_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ __device__ inline float cndGPU(float d) {
const float A5 = 1.330274429f;
const float RSQRT2PI = 0.39894228040143267793994605993438f;

float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
float K = fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));

float cnd = RSQRT2PI * __expf(-0.5f * d * d) *
float cnd = RSQRT2PI * expf(-0.5f * d * d) *
(K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));

if (d > 0) cnd = 1.0f - cnd;
Expand All @@ -59,29 +59,28 @@ __device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult,
float sqrtT, expRT;
float d1, d2, CNDD1, CNDD2;

sqrtT = __fdividef(1.0F, rsqrtf(T));
d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
sqrtT = fdividef(1.0F, 1.0 / sqrtf(T));
d1 = fdividef(logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
d2 = d1 - V * sqrtT;

CNDD1 = cndGPU(d1);
CNDD2 = cndGPU(d2);

// Calculate Call and Put simultaneously
expRT = __expf(-R * T);
expRT = expf(-R * T);
CallResult = S * CNDD1 - X * expRT * CNDD2;
PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
}

////////////////////////////////////////////////////////////////////////////////
// Process an array of optN options on GPU
////////////////////////////////////////////////////////////////////////////////
__launch_bounds__(128) __global__
void BlackScholesGPU(float2 *__restrict d_CallResult,
float2 *__restrict d_PutResult,
float2 *__restrict d_StockPrice,
float2 *__restrict d_OptionStrike,
float2 *__restrict d_OptionYears, float Riskfree,
float Volatility, int optN) {
__global__ void BlackScholesGPU(float2* __restrict d_CallResult,
float2* __restrict d_PutResult,
float2* __restrict d_StockPrice,
float2* __restrict d_OptionStrike,
float2* __restrict d_OptionYears,
float Riskfree, float Volatility, int optN) {
////Thread index
// const int tid = blockDim.x * blockIdx.x + threadIdx.x;
////Total number of threads in execution grid
Expand All @@ -100,7 +99,9 @@ __launch_bounds__(128) __global__
BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y,
d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree,
Volatility);
d_CallResult[opt] = make_float2(callResult1, callResult2);
d_PutResult[opt] = make_float2(putResult1, putResult2);
d_CallResult[opt].x = callResult1;
d_CallResult[opt].y = callResult2;
d_PutResult[opt].x = putResult1;
d_PutResult[opt].y = putResult2;
}
}
1 change: 0 additions & 1 deletion demos/CUDA/TensorContraction.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
// -L/path/to/cuda/lib64 -lcudart_static -ldl -lrt -pthread -lm -lstdc++
// RUN: ./TensorContraction

#include "cuda_runtime_api.h"
#include "clad/Differentiator/Differentiator.h"

typedef unsigned long long int size_type;
Expand Down
22 changes: 22 additions & 0 deletions include/clad/Differentiator/BuiltinDerivatives.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,28 @@ inline void free_pushforward(void* ptr, void* d_ptr) {
// NOLINTEND(cppcoreguidelines-owning-memory)
// NOLINTEND(cppcoreguidelines-no-malloc)

CUDA_HOST_DEVICE inline void expf_pullback(float a, float d_y, float* d_a) {
*d_a += expf(a) * d_y;
}

CUDA_HOST_DEVICE inline void fabsf_pullback(float a, float d_y, float* d_a) {
*d_a += (a >= 0) ? d_y : -d_y;
}

CUDA_HOST_DEVICE inline void logf_pullback(float a, float d_y, float* d_a) {
*d_a += (1.F / a) * d_y;
}

CUDA_HOST_DEVICE inline void fdividef_pullback(float a, float b, float d_y,
float* d_a, float* d_b) {
*d_a += (1.F / b) * d_y;
*d_b += (-a / (b * b)) * d_y;
}

CUDA_HOST_DEVICE inline void sqrtf_pullback(float a, float d_y, float* d_a) {
*d_a += (1.F / (2.F * sqrtf(a))) * d_y;
}

// These are required because C variants of mathematical functions are
// defined in global namespace.
using std::abs_pushforward;
Expand Down

0 comments on commit cab90ed

Please sign in to comment.