From be5463887d23ff9309a395c18ed9817bbe8012db Mon Sep 17 00:00:00 2001
From: Matteo Concas
Date: Wed, 2 Oct 2024 15:59:11 +0200
Subject: [PATCH] DCAFitterGPU: reduce I/O overhead by copying elements using a kernel

Upload a single Fitter instance per batch and replicate it across the
batch on the device with a dedicated kernel, instead of copying the
whole fitter buffer from the host.

---
 Common/DCAFitter/GPU/cuda/DCAFitterN.cu | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
index ab53ae25d7548..30249e26e37f2 100644
--- a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
+++ b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -56,6 +56,14 @@ GPUg() void printKernel(Fitter* fitter)
   }
 }
 
+template <typename Fitter>
+GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    fitters[iThread + off] = fitters[off];
+  }
+}
+
 template <typename Fitter, typename... Tr>
 GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
@@ -63,7 +71,7 @@ GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
+GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
 {
   for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
     results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
@@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
     auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
 
     gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
-    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
+    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
     iArg = 0;
     ([&] {
       gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
@@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
     gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
 
     gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
+    kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
     std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
     gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
 
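
For reference, the pattern this patch introduces (upload one configured object, then replicate it across the batch on the device) can be exercised in isolation. The following standalone CUDA sketch is illustrative only: FitterStub and initStubs are hypothetical stand-ins for the O2 Fitter type and kernel::initFitters, and the launch geometry is arbitrary.

// Standalone sketch, not part of the patch or the O2 code base.
// Demonstrates the copy-one-then-replicate pattern used above.
#include <cstdio>
#include <cuda_runtime.h>

struct FitterStub {
  float bz;     // stands in for the fitter configuration
  int maxIter;
};

// Grid-stride loop starting at global thread index + 1: slot `off` already
// holds the instance uploaded from the host, every other slot clones it.
__global__ void initStubs(FitterStub* stubs, unsigned int off, unsigned int N)
{
  for (auto iThread = blockIdx.x * blockDim.x + threadIdx.x + 1; iThread < N; iThread += blockDim.x * gridDim.x) {
    stubs[iThread + off] = stubs[off];
  }
}

int main()
{
  constexpr unsigned int N = 1024;
  FitterStub host{5.f, 20}; // configure a single instance on the host

  FitterStub* stubs_device;
  cudaMalloc(&stubs_device, N * sizeof(FitterStub));

  // H2D traffic is sizeof(FitterStub) instead of N * sizeof(FitterStub).
  cudaMemcpy(stubs_device, &host, sizeof(FitterStub), cudaMemcpyHostToDevice);
  initStubs<<<4, 256>>>(stubs_device, 0, N);
  cudaDeviceSynchronize();

  // Spot-check that the last slot received the replica.
  FitterStub last;
  cudaMemcpy(&last, stubs_device + N - 1, sizeof(FitterStub), cudaMemcpyDeviceToHost);
  printf("last slot: bz=%.1f maxIter=%d\n", last.bz, last.maxIter);

  cudaFree(stubs_device);
  return 0;
}

In the patch, correctness relies on stream ordering: the single-element cudaMemcpyAsync and the initFitters launch are enqueued on the same CUDA stream, so the kernel never reads the seed slot before the upload completes. The sketch gets the same guarantee trivially from the synchronous copy on the default stream.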