From c5496ace44fb16f9e713fb5dd53e69b194003e58 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Tue, 10 Oct 2023 16:14:14 -0400 Subject: [PATCH] force invcdf/angleinvcdf even count of float, reapply 53d7ac0, fix #193 --- src/mcx_core.cu | 31 ++++++++++++++++--------------- src/mcx_core.h | 2 ++ src/mcx_utils.c | 2 +- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/mcx_core.cu b/src/mcx_core.cu index ef9565f4..ea17a919 100644 --- a/src/mcx_core.cu +++ b/src/mcx_core.cu @@ -1416,12 +1416,12 @@ __device__ inline int launchnewphoton(MCXpos* p, MCXdir* v, Stokes* s, MCXtime* if (gcfg->c0.w > 0.f) { // if focal-length > 0, no interpolation, just read the angleinvcdf value ang = fminf(rand_uniform01(t) * gcfg->nangle, gcfg->nangle - EPS); - cphi = ((float*)(sharedmem))[(int)(ang) + gcfg->nphase]; + cphi = ((float*)(sharedmem))[(int)(ang) + gcfg->nphaselen]; } else { // odd number length, interpolate between neigboring values ang = fminf(rand_uniform01(t) * (gcfg->nangle - 1), gcfg->nangle - 1 - EPS); sphi = ang - ((int)ang); - cphi = ((1.f - sphi) * (((float*)(sharedmem))[((int)ang >= gcfg->nangle - 1 ? gcfg->nangle - 1 : (int)(ang)) + gcfg->nphase]) + - sphi * (((float*)(sharedmem))[((int)ang + 1 >= gcfg->nangle - 1 ? gcfg->nangle - 1 : (int)(ang) + 1) + gcfg->nphase])); + cphi = ((1.f - sphi) * (((float*)(sharedmem))[((int)ang >= gcfg->nangle - 1 ? gcfg->nangle - 1 : (int)(ang)) + gcfg->nphaselen]) + + sphi * (((float*)(sharedmem))[((int)ang + 1 >= gcfg->nangle - 1 ? gcfg->nangle - 1 : (int)(ang) + 1) + gcfg->nphaselen])); } cphi *= ONE_PI; // next zenith angle computed based on angleinvcdf @@ -1651,12 +1651,12 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], idx1d = gcfg->nangle / blockDim.x; for (idx1dold = 0; idx1dold < idx1d; idx1dold++) { - ppath[threadIdx.x * idx1d + idx1dold + gcfg->nphase] = gangleinvcdf[threadIdx.x * idx1d + idx1dold]; + ppath[threadIdx.x * idx1d + idx1dold + gcfg->nphaselen] = gangleinvcdf[threadIdx.x * idx1d + idx1dold]; } if (gcfg->nangle - (idx1d * blockDim.x) > 0 && threadIdx.x == 0) { for (idx1dold = 0; idx1dold < gcfg->nangle - (idx1d * blockDim.x) ; idx1dold++) { - ppath[blockDim.x * idx1d + idx1dold + gcfg->nphase] = gangleinvcdf[blockDim.x * idx1d + idx1dold]; + ppath[blockDim.x * idx1d + idx1dold + gcfg->nphaselen] = gangleinvcdf[blockDim.x * idx1d + idx1dold]; } } @@ -1667,7 +1667,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], return; } - ppath = (float*)(sharedmem + sizeof(float) * (gcfg->nphase + gcfg->nangle) + blockDim.x * (gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType))); + ppath = (float*)(sharedmem + sizeof(float) * (gcfg->nphaselen + gcfg->nanglelen) + blockDim.x * (gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType))); ppath += threadIdx.x * (gcfg->w0offset + gcfg->srcnum + 2 * (gcfg->outputtype == otRF)); // block#2: maxmedia*thread number to store the partial clearpath(ppath, gcfg->w0offset + gcfg->srcnum); ppath[gcfg->partialdata] = genergy[idx << 1]; @@ -1686,7 +1686,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], */ if (launchnewphoton(&p, &v, &s, &f, &rv, flipdir, &prop, &idx1d, field, &mediaid, &w0, 0, ppath, - n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphase + gcfg->nangle) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), media, srcpattern, + n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphaselen + gcfg->nanglelen) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), media, srcpattern, idx, (RandType*)n_seed, seeddata, gdebugdata, gprogress, photontof, &nuvox)) { GPUDEBUG(("thread %d: fail to launch photon\n", idx)); n_pos[idx] = *((float4*)(&p)); @@ -2136,7 +2136,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (launchnewphoton(&p, &v, &s, &f, &rv, flipdir, &prop, &idx1d, field, &mediaid, &w0, (((idx1d == OUTSIDE_VOLUME_MAX && gcfg->bc[9 + flipdir[3]]) || (idx1d == OUTSIDE_VOLUME_MIN && gcfg->bc[6 + flipdir[3]])) ? OUTSIDE_VOLUME_MIN : (mediaidold & DET_MASK)), - ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphase + gcfg->nangle) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), + ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphaselen + gcfg->nanglelen) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), media, srcpattern, idx, (RandType*)n_seed, seeddata, gdebugdata, gprogress, photontof, &nuvox)) { break; } @@ -2159,7 +2159,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], GPUDEBUG(("relaunch after Russian roulette at idx=[%d] mediaid=[%d], ref=[%d]\n", idx1d, mediaid, gcfg->doreflect)); if (launchnewphoton(&p, &v, &s, &f, &rv, flipdir, &prop, &idx1d, field, &mediaid, &w0, (mediaidold & DET_MASK), ppath, - n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphase + gcfg->nangle) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), + n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphaselen + gcfg->nanglelen) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), media, srcpattern, idx, (RandType*)n_seed, seeddata, gdebugdata, gprogress, photontof, &nuvox)) { break; } @@ -2187,7 +2187,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (reflectray(n1, (float3*) & (v), &rv, &nuvox, &prop, t)) { // true if photon transmits to background media if (launchnewphoton(&p, &v, &s, &f, &rv, flipdir, &prop, &idx1d, field, &mediaid, &w0, (mediaidold & DET_MASK), - ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphase + gcfg->nangle) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), + ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphaselen + gcfg->nanglelen) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), media, srcpattern, idx, (RandType*)n_seed, seeddata, gdebugdata, gprogress, photontof, &nuvox)) { break; } @@ -2240,7 +2240,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (launchnewphoton(&p, &v, &s, &f, &rv, flipdir, &prop, &idx1d, field, &mediaid, &w0, (((idx1d == OUTSIDE_VOLUME_MAX && gcfg->bc[9 + flipdir[3]]) || (idx1d == OUTSIDE_VOLUME_MIN && gcfg->bc[6 + flipdir[3]])) ? OUTSIDE_VOLUME_MIN : (mediaidold & DET_MASK)), - ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphase + gcfg->nangle) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), + ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphaselen + gcfg->nanglelen) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), media, srcpattern, idx, (RandType*)n_seed, seeddata, gdebugdata, gprogress, photontof, &nuvox)) { break; } @@ -2274,7 +2274,7 @@ __global__ void mcx_main_loop(uint media[], OutputType field[], float genergy[], if (issvmc && (nuvox.sv.isupper ? nuvox.sv.upper : nuvox.sv.lower) == 0) { // terminate photon if photon is reflected to background medium if (launchnewphoton(&p, &v, &s, &f, &rv, flipdir, &prop, &idx1d, field, &mediaid, &w0, (mediaidold & DET_MASK), - ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphase + gcfg->nangle) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), + ppath, n_det, detectedphoton, t, (RandType*)(sharedmem + sizeof(float) * (gcfg->nphaselen + gcfg->nanglelen) + threadIdx.x * gcfg->issaveseed * RAND_BUF_LEN * sizeof(RandType)), media, srcpattern, idx, (RandType*)n_seed, seeddata, gdebugdata, gprogress, photontof, &nuvox)) { break; } @@ -2637,7 +2637,8 @@ void mcx_run_simulation(Config* cfg, GPUInfo* gpu) { (uint)cfg->maxvoidstep, cfg->issaveseed > 0, (uint)cfg->issaveref, cfg->isspecular > 0, cfg->maxdetphoton * hostdetreclen, cfg->seed, (uint)cfg->outputtype, 0, 0, cfg->faststep, cfg->debuglevel, cfg->savedetflag, hostdetreclen, partialdata, w0offset, cfg->mediabyte, - (uint)cfg->maxjumpdebug, cfg->gscatter, is2d, cfg->replaydet, cfg->srcnum, cfg->nphase, cfg->nangle, cfg->omega + (uint)cfg->maxjumpdebug, cfg->gscatter, is2d, cfg->replaydet, cfg->srcnum, + cfg->nphase, cfg->nphase + (cfg->nphase & 0x1), cfg->nangle, cfg->nangle + (cfg->nangle & 0x1), cfg->omega }; if (param.isatomic) { @@ -3099,7 +3100,7 @@ void mcx_run_simulation(Config* cfg, GPUInfo* gpu) { * * The calculation of the energy conservation will only reflect the last simulation. */ - sharedbuf = (cfg->nphase + cfg->nangle) * sizeof(float) + gpu[gpuid].autoblock * (cfg->issaveseed * (RAND_BUF_LEN * sizeof(RandType)) + sizeof(float) * (param.w0offset + cfg->srcnum + 2 * (cfg->outputtype == otRF))); + sharedbuf = (param.nphaselen + param.nanglelen) * sizeof(float) + gpu[gpuid].autoblock * (cfg->issaveseed * (RAND_BUF_LEN * sizeof(RandType)) + sizeof(float) * (param.w0offset + cfg->srcnum + 2 * (cfg->outputtype == otRF))); MCX_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedbuf); @@ -3176,7 +3177,7 @@ void mcx_run_simulation(Config* cfg, GPUInfo* gpu) { */ /** \c ispencil: template constant, if 1, launch photon code is dramatically simplified */ - int ispencil = (cfg->srctype == MCX_SRC_PENCIL && cfg->nangle < 2); + int ispencil = (cfg->srctype == MCX_SRC_PENCIL && cfg->nangle == 0); /** \c isref: template constant, if 1, perform boundary reflection, if 0, total-absorbion boundary, can simplify kernel */ int isref = cfg->isreflect; diff --git a/src/mcx_core.h b/src/mcx_core.h index 6bdff701..847474f6 100644 --- a/src/mcx_core.h +++ b/src/mcx_core.h @@ -181,7 +181,9 @@ typedef struct __align__(16) KernelParams { int replaydet; /**< select which detector to replay, 0 for all, -1 save all separately */ unsigned int srcnum; /**< total number of source patterns */ unsigned int nphase; /**< number of samples for inverse-cdf, will be added by 2 to include -1 and 1 on the two ends */ + unsigned int nphaselen; /**< even-rounded nphase so that shared memory buffer won't give an error */ unsigned int nangle; /**< number of samples for launch angle inverse-cdf, will be added by 2 to include 0 and 1 on the two ends */ + unsigned int nanglelen; /**< even-rounded nangle so that shared memory buffer won't give an error */ float omega; /**< modulation angular frequency (2*pi*f), in rad/s, for FD/RF replay */ unsigned char bc[12]; /**< boundary condition flags, copy the first 12 chars from cfg->bc without the terminating NULL */ } MCXParam; diff --git a/src/mcx_utils.c b/src/mcx_utils.c index 179eed16..176773ad 100644 --- a/src/mcx_utils.c +++ b/src/mcx_utils.c @@ -2646,12 +2646,12 @@ int mcx_loadjson(cJSON* root, Config* cfg) { if (subitem) { int nangle = cJSON_GetArraySize(subitem); - cfg->nangle = nangle; if (cfg->angleinvcdf) { free(cfg->angleinvcdf); } + cfg->nangle = nangle; cfg->angleinvcdf = (float*)calloc(cfg->nangle, sizeof(float)); vv = subitem->child;