Add processBulk to perform fits in parallel

mconcas · Sep 18, 2024 · b7a7cb2 · b7a7cb2
1 parent c981ee0
commit b7a7cb2
Show file tree

Hide file tree

Showing 4 changed files with 659 additions and 74 deletions.
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -36,34 +36,51 @@ namespace o2::vertexing::device
 {
 namespace kernel
 {
+GPUg() void warmUpGpuKernel()
+{
+  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  float ia, ib;
+  ia = ib = 0.0f;
+  ib += ia + tid;
+}
+
 template <typename Fitter>
-GPUg() void printKernel(Fitter* ft)
+GPUg() void printKernel(Fitter* fitter)
 {
   if (threadIdx.x == 0) {
-    printf(" =============== GPU DCA Fitter %d prongs ================\n", Fitter::getNProngs());
-    ft->print();
+    printf(" =============== GPU DCA Fitter %d prongs =================\n", Fitter::getNProngs());
+    fitter->print();
     printf(" =========================================================\n");
   }
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processKernel(Fitter* ft, int* res, Tr*... tracks)
+GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
-  *res = ft->process(*tracks...);
+  *res = fitter->process(*tracks...);
 }
+
+template <typename Fitter, typename... Tr>
+GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    results[iThread] = fitters[iThread].process(tracks[iThread]...);
+  }
+}
+
 } // namespace kernel
 
 /// CPU handlers
 template <typename Fitter>
 void print(const int nBlocks,
            const int nThreads,
-           Fitter& ft)
+           Fitter& fitter)
 {
-  Fitter* ft_device;
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
-  gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(Fitter), cudaMemcpyHostToDevice));
+  Fitter* fitter_device;
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
+  kernel::printKernel<<<nBlocks, nThreads>>>(fitter_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
@@ -75,11 +92,11 @@ int process(const int nBlocks,
             Fitter& fitter,
             Tr&... args)
 {
-  Fitter* ft_device;
+  Fitter* fitter_device;
   std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
   int result, *result_device;
 
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
   gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
 
   int iArg{0};
@@ -90,15 +107,15 @@ int process(const int nBlocks,
   }(),
    ...);
 
-  gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, result_device, args...); }, tracks_device);
+  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(fitter_device, result_device, args...); }, tracks_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
 
   gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&fitter, fitter_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
   iArg = 0;
   ([&] {
     gpuCheckError(cudaMemcpy(&args, tracks_device[iArg], sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
@@ -107,11 +124,71 @@ int process(const int nBlocks,
   }(),
    ...);
 
+  gpuCheckError(cudaFree(fitter_device));
   gpuCheckError(cudaFree(result_device));
 
   return result;
 }
 
+template <typename Fitter, class... Tr>
+std::vector<int> processBulk(const int nBlocks,
+                             const int nThreads,
+                             std::vector<Fitter>& fitters,
+                             std::vector<Tr>&... args)
+{
+  kernel::warmUpGpuKernel<<<1, 1>>>();
+
+  cudaEvent_t start, stop;
+  gpuCheckError(cudaEventCreate(&start));
+  gpuCheckError(cudaEventCreate(&stop));
+  const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
+  std::vector<int> results(nFits);
+  int* results_device;
+  Fitter* fitters_device;
+  std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
+
+  int iArg{0};
+  ([&] {
+    gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()));
+    gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
+    ++iArg;
+  }(),
+   ...);
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
+  gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));
+
+  gpuCheckError(cudaEventRecord(start));
+  std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device);
+  gpuCheckError(cudaEventRecord(stop));
+
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+
+  gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost));
+
+  iArg = 0;
+  ([&] {
+    gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
+    gpuCheckError(cudaFree(tracks_device[iArg]));
+    ++iArg;
+  }(),
+   ...);
+
+  gpuCheckError(cudaFree(fitters_device));
+  gpuCheckError(cudaFree(results_device));
+  gpuCheckError(cudaEventSynchronize(stop));
+
+  float milliseconds = 0;
+  gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
+
+  LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
+  return results;
+}
+
+template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
+template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<3>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);