From be5463887d23ff9309a395c18ed9817bbe8012db Mon Sep 17 00:00:00 2001
From: Matteo Concas
Date: Wed, 2 Oct 2024 15:59:11 +0200
Subject: [PATCH] DCAFitterGPU: reduce I/O overhead by copying elements using a kernel

Upload a single Fitter instance per batch and replicate it across the
batch on the device with a dedicated kernel, instead of copying the
whole fitter buffer from the host.

---
 Common/DCAFitter/GPU/cuda/DCAFitterN.cu | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
index ab53ae25d7548..30249e26e37f2 100644
--- a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
+++ b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -56,6 +56,14 @@ GPUg() void printKernel(Fitter* fitter)
   }
 }
 
+template <typename Fitter>
+GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    fitters[iThread + off] = fitters[off];
+  }
+}
+
 template <typename Fitter, typename... Tr>
 GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
@@ -63,7 +71,7 @@ GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
+GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
 {
   for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
     results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
@@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
     auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
 
     gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
-    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
+    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
     iArg = 0;
     ([&] {
       gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
     gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
 
     gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
+    kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
     std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
     gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
 
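
For reference, the pattern this patch introduces (upload one configured object, then replicate it across the batch on the device) can be exercised in isolation. The following standalone CUDA sketch is illustrative only: FitterStub and initStubs are hypothetical stand-ins for the O2 Fitter type and kernel::initFitters, and the launch geometry is arbitrary.

// Standalone sketch, not part of the patch or the O2 code base.
// Demonstrates the copy-one-then-replicate pattern used above.
#include <cstdio>
#include <cuda_runtime.h>

struct FitterStub {
  float bz;     // stands in for the fitter configuration
  int maxIter;
};

// Grid-stride loop starting at global thread index + 1: slot `off` already
// holds the instance uploaded from the host, every other slot clones it.
__global__ void initStubs(FitterStub* stubs, unsigned int off, unsigned int N)
{
  for (auto iThread = blockIdx.x * blockDim.x + threadIdx.x + 1; iThread < N; iThread += blockDim.x * gridDim.x) {
    stubs[iThread + off] = stubs[off];
  }
}

int main()
{
  constexpr unsigned int N = 1024;
  FitterStub host{5.f, 20}; // configure a single instance on the host

  FitterStub* stubs_device;
  cudaMalloc(&stubs_device, N * sizeof(FitterStub));

  // H2D traffic is sizeof(FitterStub) instead of N * sizeof(FitterStub).
  cudaMemcpy(stubs_device, &host, sizeof(FitterStub), cudaMemcpyHostToDevice);
  initStubs<<<4, 256>>>(stubs_device, 0, N);
  cudaDeviceSynchronize();

  // Spot-check that the last slot received the replica.
  FitterStub last;
  cudaMemcpy(&last, stubs_device + N - 1, sizeof(FitterStub), cudaMemcpyDeviceToHost);
  printf("last slot: bz=%.1f maxIter=%d\n", last.bz, last.maxIter);

  cudaFree(stubs_device);
  return 0;
}

In the patch, correctness relies on stream ordering: the single-element cudaMemcpyAsync and the initFitters launch are enqueued on the same CUDA stream, so the kernel never reads the seed slot before the upload completes. The sketch gets the same guarantee trivially from the synchronous copy on the default stream.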