Add processBulk to perform fits in parallel

mconcas · Sep 18, 2024 · 350cf3c · 350cf3c
1 parent c981ee0
commit 350cf3c
Show file tree

Hide file tree

Showing 4 changed files with 209 additions and 26 deletions.
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -37,33 +37,42 @@ namespace o2::vertexing::device
 namespace kernel
 {
 template <typename Fitter>
-GPUg() void printKernel(Fitter* ft)
+GPUg() void printKernel(Fitter* fitter)
 {
   if (threadIdx.x == 0) {
-    printf(" =============== GPU DCA Fitter %d prongs ================\n", Fitter::getNProngs());
-    ft->print();
+    printf(" =============== GPU DCA Fitter %d prongs =================\n", Fitter::getNProngs());
+    fitter->print();
     printf(" =========================================================\n");
   }
 }
 
 template <typename Fitter, typename... Tr>
-GPUg() void processKernel(Fitter* ft, int* res, Tr*... tracks)
+GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
-  *res = ft->process(*tracks...);
+  *res = fitter->process(*tracks...);
 }
+
+template <typename Fitter, typename... Tr>
+GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    results[iThread] = fitters[iThread].process(tracks[iThread]...);
+  }
+}
+
 } // namespace kernel
 
 /// CPU handlers
 template <typename Fitter>
 void print(const int nBlocks,
            const int nThreads,
-           Fitter& ft)
+           Fitter& fitter)
 {
-  Fitter* ft_device;
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
-  gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(Fitter), cudaMemcpyHostToDevice));
+  Fitter* fitter_device;
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
+  kernel::printKernel<<<nBlocks, nThreads>>>(fitter_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
@@ -75,11 +84,11 @@ int process(const int nBlocks,
             Fitter& fitter,
             Tr&... args)
 {
-  Fitter* ft_device;
+  Fitter* fitter_device;
   std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
   int result, *result_device;
 
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
   gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
 
   int iArg{0};
@@ -90,15 +99,15 @@ int process(const int nBlocks,
   }(),
    ...);
 
-  gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, result_device, args...); }, tracks_device);
+  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(fitter_device, result_device, args...); }, tracks_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
 
   gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&fitter, fitter_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
   iArg = 0;
   ([&] {
     gpuCheckError(cudaMemcpy(&args, tracks_device[iArg], sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
@@ -112,6 +121,63 @@ int process(const int nBlocks,
   return result;
 }
 
+template <typename Fitter, class... Tr>
+std::vector<int> processBulk(const int nBlocks,
+                             const int nThreads,
+                             std::vector<Fitter>& fitters,
+                             std::vector<Tr>&... args)
+{
+  cudaEvent_t start, stop;
+  gpuCheckError(cudaEventCreate(&start));
+  gpuCheckError(cudaEventCreate(&stop));
+  const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
+  std::vector<int> results(nFits);
+  int* results_device;
+  Fitter* fitters_device;
+  std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
+
+  int iArg{0};
+  ([&] {
+    gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()));
+    gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
+    ++iArg;
+  }(),
+   ...);
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
+  gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));
+
+  // Call kernel
+  gpuCheckError(cudaEventRecord(start));
+  std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device);
+  gpuCheckError(cudaEventRecord(stop));
+
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+
+  gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost));
+
+  iArg = 0;
+  ([&] {
+    gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
+    gpuCheckError(cudaFree(tracks_device[iArg]));
+    ++iArg;
+  }(),
+   ...);
+
+  gpuCheckError(cudaFree(fitters_device));
+  gpuCheckError(cudaFree(results_device));
+  gpuCheckError(cudaEventSynchronize(stop));
+
+  float milliseconds = 0;
+  gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
+
+  LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
+  return results;
+}
+
+template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);

diff --git a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -224,7 +224,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     }
     device::print(1, 1, ft);
     meanDA /= nfoundA ? nfoundA : 1;
-    meanDAW /= nfoundA ? nfoundA : 1;
+    meanDAW /= nfoundAW ? nfoundA : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix";
     LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
@@ -554,5 +554,124 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
   outStream.Close();
 }
 
+BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
+{
+  gRandom->Delete();
+  gRandom = new TRandom(42);
+  constexpr int NTest = 10000;
+  o2::utils::TreeStreamRedirector outStreamB("dcafitterNTestBulk.root");
+
+  TGenPhaseSpace genPHS;
+  constexpr double ele = 0.00051;
+  constexpr double gamma = 2 * ele + 1e-6;
+  constexpr double pion = 0.13957;
+  constexpr double k0 = 0.49761;
+  constexpr double kch = 0.49368;
+  constexpr double dch = 1.86965;
+  std::vector<double> gammadec = {ele, ele};
+  std::vector<double> k0dec = {pion, pion};
+  std::vector<double> dchdec = {pion, kch, pion};
+  std::vector<std::vector<o2::track::TrackParCov>> vctracks(2, std::vector<o2::track::TrackParCov>(NTest));
+  std::vector<Vec3D> vtxGen(NTest);
+
+  double bz = 5.0;
+  { // 2 prongs vertices bulk processing
+    LOG(info) << "Bulk-processing 2-prong Helix - Helix case";
+    std::vector<int> forceQ{1, 1};
+
+    o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
+    ft.setBz(bz);
+    ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
+    ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
+    ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
+    ft.setMaxDXYIni(4);          // do not consider V0 seeds with tracks XY-distance exceeding this. This is default anyway
+    ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
+    ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
+
+    std::vector<o2::vertexing::DCAFitterN<2>> fitters_host;
+    std::vector<TLorentzVector> genParents(NTest);
+    fitters_host.resize(NTest);
+
+    std::string treeName2Abulk = "pr2aBulk", treeName2AWbulk = "pr2awBulk", treeName2Wbulk = "pr2wBulk";
+    TStopwatch swAb, swAWb, swWb;
+    int nfoundAb = 0, nfoundAWb = 0, nfoundWb = 0;
+    double meanDAb = 0, meanDAWb = 0, meanDWb = 0;
+    swAb.Stop();
+    swAWb.Stop();
+    swWb.Stop();
+
+    ft.setUseAbsDCA(true);
+    std::fill(fitters_host.begin(), fitters_host.end(), ft);
+    for (int iev = 0; iev < NTest; iev++) {
+      std::vector<o2::track::TrackParCov> vc(2);
+      genParents[iev] = generate(vtxGen[iev], vc, bz, genPHS, k0, k0dec, forceQ);
+      vctracks[0][iev] = vc[0];
+      vctracks[1][iev] = vc[1];
+    }
+
+    swAb.Start(false);
+    auto ncAb = device::processBulk(20, 512, fitters_host, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    swAb.Stop();
+
+    for (int iev = 0; iev < NTest; iev++) {
+      LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAb[iev] << " Chi2: " << (ncAb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
+      if (ncAb[iev]) {
+        auto minDb = checkResults(outStreamB, treeName2Abulk, fitters_host[iev], vtxGen[iev], genParents[iev], k0dec);
+        meanDAb += minDb;
+        nfoundAb++;
+      }
+    }
+
+    ft.setUseAbsDCA(true);
+    ft.setWeightedFinalPCA(true);
+    std::fill(fitters_host.begin(), fitters_host.end(), ft);
+    swAWb.Start(false);
+    auto ncAWb = device::processBulk(20, 512, fitters_host, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    swAWb.Stop();
+
+    for (int iev = 0; iev < NTest; iev++) {
+      LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncAWb[iev] << " Chi2: " << (ncAWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
+      if (ncAWb[iev]) {
+        auto minDb = checkResults(outStreamB, treeName2AWbulk, fitters_host[iev], vtxGen[iev], genParents[iev], k0dec);
+        meanDAWb += minDb;
+        nfoundAWb++;
+      }
+    }
+
+    ft.setUseAbsDCA(false);
+    ft.setWeightedFinalPCA(false);
+    std::fill(fitters_host.begin(), fitters_host.end(), ft);
+    swWb.Start(false);
+    auto ncWb = device::processBulk(20, 512, fitters_host, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    swWb.Stop();
+
+    for (int iev = 0; iev < NTest; iev++) {
+      LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncWb[iev] << " Chi2: " << (ncWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
+      if (ncWb[iev]) {
+        auto minDb = checkResults(outStreamB, treeName2Wbulk, fitters_host[iev], vtxGen[iev], genParents[iev], k0dec);
+        meanDWb += minDb;
+        nfoundWb++;
+      }
+    }
+
+    meanDAb /= nfoundAb ? nfoundAb : 1;
+    meanDAWb /= nfoundAWb ? nfoundAb : 1;
+    meanDWb /= nfoundWb ? nfoundWb : 1;
+    LOGP(info, "Processed {} 2-prong vertices Helix : Helix", NTest);
+    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest
+              << " mean.dist to truth: " << meanDAb << " GPU time: " << swAb.CpuTime();
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest
+              << " mean.dist to truth: " << meanDAWb << " GPU time: " << swAWb.CpuTime();
+    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest
+              << " mean.dist to truth: " << meanDWb << " GPU time: " << swWb.CpuTime();
+    BOOST_CHECK(nfoundAb > 0.99 * NTest);
+    BOOST_CHECK(nfoundAWb > 0.99 * NTest);
+    BOOST_CHECK(nfoundWb > 0.99 * NTest);
+    BOOST_CHECK(meanDAb < 0.1);
+    BOOST_CHECK(meanDAWb < 0.1);
+    BOOST_CHECK(meanDWb < 0.1);
+  }
+}
+
 } // namespace vertexing
 } // namespace o2
diff --git a/Common/DCAFitter/include/DCAFitter/DCAFitterN.h b/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
@@ -17,10 +17,10 @@
 #ifndef _ALICEO2_DCA_FITTERN_
 #define _ALICEO2_DCA_FITTERN_
 
-#include "MathUtils/Cartesian.h"
-#include "ReconstructionDataFormats/Track.h"
 #include "DCAFitter/HelixHelper.h"
 #include "DetectorsBase/Propagator.h"
+#include "MathUtils/Cartesian.h"
+#include "ReconstructionDataFormats/Track.h"
 
 namespace o2
 {
@@ -1136,15 +1136,13 @@ using DCAFitter3 = DCAFitterN<3, o2::track::TrackParCov>;
 namespace device
 {
 template <typename Fitter>
-void print(const int nBlocks,
-           const int nThreads,
-           Fitter& ft);
+void print(const int nBlocks, const int nThreads, Fitter& ft);
 
 template <typename Fitter, class... Tr>
-int process(const int nBlocks,
-            const int nThreads,
-            Fitter&,
-            Tr&... args);
+int process(const int nBlocks, const int nThreads, Fitter&, Tr&... args);
+
+template <class Fitter, class... Tr>
+std::vector<int> processBulk(const int nBlocks, const int nThreads, std::vector<Fitter>& fitters, std::vector<Tr>&... args);
 } // namespace device
 
 } // namespace vertexing

diff --git a/Common/DCAFitter/test/testDCAFitterN.cxx b/Common/DCAFitter/test/testDCAFitterN.cxx
@@ -223,7 +223,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     }
     ft.print();
     meanDA /= nfoundA ? nfoundA : 1;
-    meanDAW /= nfoundA ? nfoundA : 1;
+    meanDAW /= nfoundAW ? nfoundA : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix";
     LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest