From 9d34af4e91f90bc327ec8c9b985a195037cb6c39 Mon Sep 17 00:00:00 2001
From: Matteo Concas <mett1990@gmail.com>
Date: Tue, 1 Oct 2024 19:57:30 +0200
Subject: [PATCH] Improve scheduling, benchmarking, and config

---
 .../GPU/DeviceInterface/GPUInterface.h        |   1 +
 Common/DCAFitter/GPU/cuda/CMakeLists.txt      |   2 +-
 Common/DCAFitter/GPU/cuda/DCAFitterN.cu       |  72 ++++++++----
 Common/DCAFitter/GPU/cuda/GPUInterface.cu     |   7 +-
 .../GPU/cuda/test/testDCAFitterNGPU.cxx       | 106 +++++++++---------
 Common/DCAFitter/GPU/hip/CMakeLists.txt       |   2 +-
 6 files changed, 113 insertions(+), 77 deletions(-)
diff --git a/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h b/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h
index effd13021d538..3aa5ead805acd 100644
--- a/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h
+++ b/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h
@@ -17,6 +17,7 @@
 
 #include <thread>
 #include <vector>
+#include <atomic>
 
 namespace o2
 {
diff --git a/Common/DCAFitter/GPU/cuda/CMakeLists.txt b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
index 42c2b8f0d5059..ddc1d09445d7f 100644
--- a/Common/DCAFitter/GPU/cuda/CMakeLists.txt
+++ b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
@@ -20,7 +20,7 @@ o2_add_library(DCAFitterCUDA
                                      O2::DetectorsBase
                PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
 set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
-add_compile_options(-lineinfo)
+# add_compile_options(-lineinfo)
 
 o2_add_test(DCAFitterNCUDA
             SOURCES test/testDCAFitterNGPU.cxx
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
index 28a969f25dee3..ab53ae25d7548 100644
--- a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
+++ b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -15,6 +15,8 @@
 #include <cuda.h>
 #endif
 
+#include <numeric>
+
 #include "GPUCommonDef.h"
 #include "DCAFitter/DCAFitterN.h"
 #include "DeviceInterface/GPUInterface.h"
@@ -133,23 +135,25 @@ int process(const int nBlocks,
 template <typename Fitter, class... Tr>
 void processBulk(const int nBlocks,
                  const int nThreads,
-                 const int nStreams,
+                 const int nBatches,
                  std::vector<Fitter>& fitters,
                  std::vector<int>& results,
                  std::vector<Tr>&... args)
 {
   auto* gpuInterface = GPUInterface::Instance();
-  kernel::warmUpGpuKernel<<<1, 1>>>();
+  kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>();
 
   // Benchmarking events
-  // std::vector<cudaEvent_t> start(nStreams), stop(nStreams);
-  // cudaEvent_t totalStart, totalStop;
-  // gpuCheckError(cudaEventCreate(&totalStart));
-  // gpuCheckError(cudaEventCreate(&totalStop));
-  // for (int iBatch{0}; iBatch < nStreams; ++iBatch) {
-  //   gpuCheckError(cudaEventCreate(&start[iBatch]));
-  //   gpuCheckError(cudaEventCreate(&stop[iBatch]));
-  // }
+  std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches);
+  std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches);
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventCreate(&startIOUp[iBatch]));
+    gpuCheckError(cudaEventCreate(&endIOUp[iBatch]));
+    gpuCheckError(cudaEventCreate(&startIODown[iBatch]));
+    gpuCheckError(cudaEventCreate(&endIODown[iBatch]));
+    gpuCheckError(cudaEventCreate(&startKer[iBatch]));
+    gpuCheckError(cudaEventCreate(&endKer[iBatch]));
+  }
 
   // Tracks
   std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
@@ -171,16 +175,17 @@ void processBulk(const int nBlocks,
   int* results_device;
   gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size());
 
-  // gpuCheckError(cudaEventRecord(totalStart));
+  // R.R. Computation
   int totalSize = fitters.size();
-  int batchSize = totalSize / nStreams;
-  int remainder = totalSize % nStreams;
+  int batchSize = totalSize / nBatches;
+  int remainder = totalSize % nBatches;
 
-  for (int iBatch{0}; iBatch < nStreams; ++iBatch) {
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
     auto& stream = gpuInterface->getNextStream();
     auto offset = iBatch * batchSize + std::min(iBatch, remainder);
     auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
 
+    gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
     gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
     iArg = 0;
     ([&] {
@@ -188,23 +193,27 @@ void processBulk(const int nBlocks,
       ++iArg;
     }(),
      ...);
-    // gpuCheckError(cudaEventRecord(start[iBatch]));
+    gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
+
+    gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
     std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
-    // gpuCheckError(cudaEventRecord(stop[iBatch]));
+    gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
 
     gpuCheckError(cudaPeekAtLastError());
-    gpuCheckError(cudaStreamSynchronize(stream));
     iArg = 0;
+    gpuCheckError(cudaEventRecord(startIODown[iBatch], stream));
     ([&] {
       gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream));
       ++iArg;
     }(),
      ...);
+
     gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream));
     gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
+    gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
   }
+
   ([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
-  // gpuCheckError(cudaEventRecord(totalStop));
 
   for (auto* tracksD : tracks_device) {
     gpuInterface->freeDevice(tracksD);
@@ -214,11 +223,30 @@ void processBulk(const int nBlocks,
   gpuInterface->freeDevice(results_device);
   gpuInterface->unregisterBuffer(fitters.data());
   gpuInterface->unregisterBuffer(results.data());
-  // float milliseconds = 0;
-  // gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
 
-  // LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
-  // return results;
+  // Do benchmarks
+  gpuCheckError(cudaDeviceSynchronize());
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
+    gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
+    gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
+  }
+
+  float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
+  float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
+  float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
+  LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads);
+  LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches);
+  LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches);
+
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventDestroy(startIOUp[iBatch]));
+    gpuCheckError(cudaEventDestroy(endIOUp[iBatch]));
+    gpuCheckError(cudaEventDestroy(startIODown[iBatch]));
+    gpuCheckError(cudaEventDestroy(endIODown[iBatch]));
+    gpuCheckError(cudaEventDestroy(startKer[iBatch]));
+    gpuCheckError(cudaEventDestroy(endKer[iBatch]));
+  }
 }
 
 template void processBulk(const int,
diff --git a/Common/DCAFitter/GPU/cuda/GPUInterface.cu b/Common/DCAFitter/GPU/cuda/GPUInterface.cu
index 26529d41b294b..09f9cdc595dcd 100644
--- a/Common/DCAFitter/GPU/cuda/GPUInterface.cu
+++ b/Common/DCAFitter/GPU/cuda/GPUInterface.cu
@@ -9,6 +9,9 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
+/// \brief Helper interface to the GPU device, meant to be compatible with manual allocation/streams and GPUReconstruction ones.
+/// \author matteo.concas@cern.ch
+
 #ifdef __HIPCC__
 #include "hip/hip_runtime.h"
 #else
@@ -16,6 +19,7 @@
 #endif
 
 #include <iostream>
+#include <cstdlib>
 
 #include "DeviceInterface/GPUInterface.h"
 
@@ -75,7 +79,8 @@ GPUInterface* GPUInterface::sGPUInterface = nullptr;
 GPUInterface* GPUInterface::Instance()
 {
   if (sGPUInterface == nullptr) {
-    sGPUInterface = new GPUInterface(8); // FIXME: get some configurable param to do so.
+    const auto* envValue = std::getenv("GPUINTERFACE_NSTREAMS");
+    sGPUInterface = new GPUInterface(envValue == nullptr ? 8 : std::stoi(envValue));
   }
   return sGPUInterface;
 }
diff --git a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
index 56da8e173bcf0..a7254931737cb 100644
--- a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
+++ b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -23,11 +23,6 @@
 #include <Math/SVector.h>
 #include <array>
 
-#define nBlocks 30
-#define nThreads 256
-#define nStreams 8
-#define NTest 100001
-
 namespace o2
 {
 namespace vertexing
@@ -232,11 +227,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     meanDAW /= nfoundAW ? nfoundA : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix";
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest
               << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest
               << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest
               << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime() * 1000 << " ms";
     BOOST_CHECK(nfoundA > 0.99 * NTest);
     BOOST_CHECK(nfoundAW > 0.99 * NTest);
@@ -310,11 +305,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     meanDAW /= nfoundA ? nfoundA : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix from gamma conversion";
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest
               << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime();
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest
               << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime();
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest
               << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime();
     BOOST_CHECK(nfoundA > 0.99 * NTest);
     BOOST_CHECK(nfoundAW > 0.99 * NTest);
@@ -388,11 +383,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     meanDAW /= nfoundAW ? nfoundAW : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices: Helix : Line";
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest
               << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime();
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest
               << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime();
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest
               << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime();
     BOOST_CHECK(nfoundA > 0.99 * NTest);
     BOOST_CHECK(nfoundAW > 0.99 * NTest);
@@ -465,11 +460,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     meanDAW /= nfoundAW ? nfoundAW : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices: Line : Line";
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest
               << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime();
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest
               << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime();
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest
               << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime();
     BOOST_CHECK(nfoundA > 0.99 * NTest);
     BOOST_CHECK(nfoundAW > 0.99 * NTest);
@@ -541,11 +536,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     meanDAW /= nfoundAW ? nfoundAW : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 3-prong vertices";
-    LOG(info) << "3-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+    LOG(info) << "3-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest
               << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime();
-    LOG(info) << "3-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+    LOG(info) << "3-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest
               << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime();
-    LOG(info) << "3-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+    LOG(info) << "3-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest
               << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime();
     BOOST_CHECK(nfoundA > 0.99 * NTest);
     BOOST_CHECK(nfoundAW > 0.99 * NTest);
@@ -561,8 +556,15 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
 
 BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 {
-  // gRandom->Delete();
-  // gRandom = new TRandom(42);
+  const char* nThreadsEnvVarName = "DCAFITTERGPU_TEST_NTHREADS";
+  const char* nBlocksEnvVarName = "DCAFITTERGPU_TEST_NBLOCKS";
+  const char* nBatchesEnvVarName = "DCAFITTERGPU_TEST_NBATCHES";
+  const char* nTestsEnvVarName = "DCAFITTERGPU_TEST_NTESTS";
+  int nBlocks = std::getenv(nThreadsEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nThreadsEnvVarName));
+  int nThreads = std::getenv(nBlocksEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nBlocksEnvVarName));
+  int nBatches = std::getenv(nBatchesEnvVarName) == nullptr ? 8 : std::stoi(std::getenv(nBatchesEnvVarName));
+  int NTest = std::getenv(nTestsEnvVarName) == nullptr ? 100001 : std::stoi(std::getenv(nTestsEnvVarName));
+
   o2::utils::TreeStreamRedirector outStreamB("dcafitterNTestBulk.root");
 
   TGenPhaseSpace genPHS;
@@ -614,7 +616,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 
     swAb.Start(false);
     std::vector<int> ncAb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -631,7 +633,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swAWb.Start(false);
     std::vector<int> ncAWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAWb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -648,7 +650,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swWb.Start(false);
     std::vector<int> ncWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swWb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -664,11 +666,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     meanDAWb /= nfoundAWb ? nfoundAWb : 1;
     meanDWb /= nfoundWb ? nfoundWb : 1;
     LOGP(info, "Bulk-processed {} 2-prong vertices Helix : Helix", NTest);
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest
               << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest
               << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest
               << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms";
     BOOST_CHECK(nfoundAb > 0.99 * NTest);
     BOOST_CHECK(nfoundAWb > 0.99 * NTest);
@@ -713,7 +715,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 
     swAb.Start(false);
     std::vector<int> ncAb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -730,7 +732,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swAWb.Start(false);
     std::vector<int> ncAWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAWb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -747,7 +749,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swWb.Start(false);
     std::vector<int> ncWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swWb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -764,11 +766,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     meanDAWb /= nfoundAWb ? nfoundAWb : 1;
     meanDWb /= nfoundWb ? nfoundWb : 1;
     LOGP(info, "Bulk-processed {} 2-prong vertices Helix : Helix from gamma conversion", NTest);
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest
               << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest
               << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest
               << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms";
     BOOST_CHECK(nfoundAb > 0.99 * NTest);
     BOOST_CHECK(nfoundAWb > 0.99 * NTest);
@@ -814,7 +816,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 
     swAb.Start(false);
     std::vector<int> ncAb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -831,7 +833,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swAWb.Start(false);
     std::vector<int> ncAWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAWb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -848,7 +850,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swWb.Start(false);
     std::vector<int> ncWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swWb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -865,11 +867,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     meanDAWb /= nfoundAWb ? nfoundAWb : 1;
     meanDWb /= nfoundWb ? nfoundWb : 1;
     LOG(info) << "Bulk-processed " << NTest << " 2-prong vertices: Helix : Line";
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest
               << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest
               << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest
               << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms";
     BOOST_CHECK(nfoundAb > 0.99 * NTest);
     BOOST_CHECK(nfoundAWb > 0.99 * NTest);
@@ -914,7 +916,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 
     swAb.Start(false);
     std::vector<int> ncAb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -931,7 +933,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swAWb.Start(false);
     std::vector<int> ncAWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swAWb.Stop();
     for (int iev = 0; iev < NTest; iev++) {
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAWb[iev] << " Chi2: " << (ncAWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
@@ -948,7 +950,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 
     swWb.Start(false);
     std::vector<int> ncWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
     swWb.Stop();
 
     for (int iev = 0; iev < NTest; iev++) {
@@ -964,11 +966,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     meanDAWb /= nfoundAWb ? nfoundAWb : 1;
     meanDWb /= nfoundWb ? nfoundWb : 1;
     LOG(info) << "Bulk-processed " << NTest << " 2-prong vertices: Line : Line";
-    LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest
+    LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest
               << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest
+    LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest
               << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms";
-    LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest
+    LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest
               << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms";
     BOOST_CHECK(nfoundAb > 0.99 * NTest);
     BOOST_CHECK(nfoundAWb > 0.99 * NTest);
@@ -1014,7 +1016,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     std::fill(fitters_host.begin(), fitters_host.end(), ft);
     swAb.Start(false);
     std::vector<int> ncAb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
     swAb.Stop();
     for (int iev = 0; iev < NTest; iev++) {
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAb[iev] << " Chi2: " << (ncAb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
@@ -1031,7 +1033,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 
     swAWb.Start(false);
     std::vector<int> ncAWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
     swAWb.Stop();
     for (int iev = 0; iev < NTest; iev++) {
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAWb[iev] << " Chi2: " << (ncAWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
@@ -1048,7 +1050,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
 
     swWb.Start(false);
     std::vector<int> ncWb(NTest, 0);
-    device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+    device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
     swWb.Stop();
     for (int iev = 0; iev < NTest; iev++) {
       LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncWb[iev] << " Chi2: " << (ncWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
@@ -1064,11 +1066,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
     meanDAWb /= nfoundAWb ? nfoundAWb : 1;
     meanDWb /= nfoundWb ? nfoundWb : 1;
     LOG(info) << "Bulk-processed " << NTest << " 3-prong vertices";
-    LOG(info) << "3-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest
+    LOG(info) << "3-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest
               << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms";
-    LOG(info) << "3-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest
+    LOG(info) << "3-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest
               << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms";
-    LOG(info) << "3-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest
+    LOG(info) << "3-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest
               << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms";
     BOOST_CHECK(nfoundAb > 0.99 * NTest);
     BOOST_CHECK(nfoundAWb > 0.99 * NTest);
diff --git a/Common/DCAFitter/GPU/hip/CMakeLists.txt b/Common/DCAFitter/GPU/hip/CMakeLists.txt
index a804775d051c7..f62759bb6ea2c 100644
--- a/Common/DCAFitter/GPU/hip/CMakeLists.txt
+++ b/Common/DCAFitter/GPU/hip/CMakeLists.txt
@@ -20,7 +20,7 @@ o2_add_hipified_library(DCAFitterHIP
                                               hip::host
                         PRIVATE_LINK_LIBRARIES O2::GPUTrackingHIPExternalProvider
                         TARGETVARNAME targetNAme)
-                          
+
 o2_add_test(DCAFitterNHIP
             SOURCES ../cuda/test/testDCAFitterNGPU.cxx
             PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats