DCAFitterGPU: reduce I/O overhead by copying elements using a kernel
mconcas committed Oct 2, 2024
1 parent 8572af8 commit be54638
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -56,14 +56,26 @@ GPUg() void printKernel(Fitter* fitter)
}
}

template <typename Fitter>
GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
{
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
fitters[iThread + off] = fitters[off];
}
}

template <typename Fitter, typename... Tr>
GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
{
*res = fitter->process(*tracks...);
}

template <typename Fitter, typename... Tr>
<<<<<<< Updated upstream
GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
=======
GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
>>>>>>> Stashed changes
{
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
@@ -186,7 +198,11 @@ void processBulk(const int nBlocks,
auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
<<<<<<< Updated upstream
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
=======
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
>>>>>>> Stashed changes
iArg = 0;
([&] {
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +212,10 @@ void processBulk(const int nBlocks,
gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
<<<<<<< Updated upstream
=======
kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
>>>>>>> Stashed changes
std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
@@ -212,6 +232,28 @@ void processBulk(const int nBlocks,
gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
}
<<<<<<< Updated upstream
([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
for (auto* tracksD : tracks_device) {
gpuInterface->freeDevice(tracksD);
}
gpuInterface->freeDevice(fitters_device);
gpuInterface->freeDevice(results_device);
gpuInterface->unregisterBuffer(fitters.data());
gpuInterface->unregisterBuffer(results.data());
// Do benchmarks
gpuCheckError(cudaDeviceSynchronize());
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
}
=======
([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
@@ -232,6 +274,7 @@ void processBulk(const int nBlocks,
gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
}
>>>>>>> Stashed changes
float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
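The intent of the change, per the commit title and the "Stashed changes" side of the diff, is to upload a single Fitter per batch (sizeof(Fitter) bytes instead of sizeof(Fitter) * nFits) and then fan it out across the batch on the device with kernel::initFitters before processBatchKernel runs. Below is a minimal, self-contained sketch of that copy-one-then-replicate pattern; DummyFitter, replicateFirst and kNFits are illustrative stand-ins, not names from the O2 code, and the launch configuration is arbitrary.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Stand-in for a fitter object whose configuration is identical across a batch.
struct DummyFitter {
  float bz = 5.0f;
  int maxIterations = 60;
};

// Grid-stride replication kernel: every slot after `off` is initialised from the
// single element previously copied from the host (same pattern as kernel::initFitters).
__global__ void replicateFirst(DummyFitter* fitters, unsigned int off, unsigned int n)
{
  for (auto i = blockIdx.x * blockDim.x + threadIdx.x + 1; i < n; i += blockDim.x * gridDim.x) {
    fitters[i + off] = fitters[off];
  }
}

int main()
{
  constexpr unsigned int kNFits = 1u << 16;
  std::vector<DummyFitter> hostFitters(kNFits); // identically configured on the host

  DummyFitter* devFitters = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&devFitters), sizeof(DummyFitter) * kNFits);

  // Before: upload the whole array, sizeof(DummyFitter) * kNFits bytes per batch.
  // After: upload one element and replicate it on the device.
  cudaMemcpy(devFitters, hostFitters.data(), sizeof(DummyFitter), cudaMemcpyHostToDevice);
  replicateFirst<<<64, 256>>>(devFitters, 0u, kNFits);
  cudaDeviceSynchronize();

  std::printf("initialised %u device fitters from a single %zu-byte upload\n", kNFits, sizeof(DummyFitter));
  cudaFree(devFitters);
  return 0;
}

This trade only works when every fitter in a batch starts from the same state; under that assumption the per-batch host-to-device traffic shrinks from nFits objects to one object plus a cheap device-side kernel.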
