diff --git a/Common/DCAFitter/GPU/cuda/CMakeLists.txt b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
index 149cef4a8a843..a498d0c350202 100644
--- a/Common/DCAFitter/GPU/cuda/CMakeLists.txt
+++ b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
@@ -8,7 +8,7 @@
 # In applying this license CERN does not waive the privileges and immunities
 # granted to it by virtue of its status as an Intergovernmental Organization
 # or submit itself to any jurisdiction.
-add_compile_definitions(GPUCA_GPU_DEBUG_PRINT)
+
 o2_add_library(DCAFitterCUDA
                TARGETVARNAME targetName
                SOURCES DCAFitterN.cu
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
index 31fc34614e2ec..e3896bf6de89c 100644
--- a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
+++ b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -36,113 +36,84 @@ namespace o2::vertexing::device
 {
 namespace kernel
 {
-GPUg() void printKernel(o2::vertexing::DCAFitterN<2>* ft)
+template <int N = 2>
+GPUg() void printKernel(o2::vertexing::DCAFitterN<N>* ft)
 {
   if (threadIdx.x == 0) {
-    printf(" =============== GPU DCA Fitter ================\n");
+    printf(" =============== GPU DCA Fitter %d prongs ================\n", N);
     ft->print();
-    printf(" ===============================================\n");
+    printf(" =========================================================\n\n");
   }
 }
 
-GPUg() void processKernel(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParCov* t1, o2::track::TrackParCov* t2, int* res)
+template <typename Fitter, typename... Tr>
+GPUg() void processKernel(Fitter* ft, int* res, Tr*... tracks)
 {
-  *res = ft->process(*t1, *t2);
+  *res = ft->process(*tracks...);
 }
 } // namespace kernel
 
-void print(o2::vertexing::DCAFitterN<2>& ft,
-           const int nBlocks,
-           const int nThreads)
+/// CPU handlers
+template <typename Fitter>
+void print(const int nBlocks,
+           const int nThreads,
+           Fitter& ft)
 {
-  DCAFitterN<2>* ft_device;
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
-  gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
+  Fitter* ft_device;
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
+  gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
+  kernel::printKernel<Fitter::getNProngs()><<<nBlocks, nThreads>>>(ft_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
 }
 
-int process(o2::vertexing::DCAFitterN<2>& fitter,
-            o2::track::TrackParCov& track1,
-            o2::track::TrackParCov& track2,
-            const int nBlocks,
-            const int nThreads)
-{
-  DCAFitterN<2>* ft_device;
-  o2::track::TrackParCov* t1_device;
-  o2::track::TrackParCov* t2_device;
-  int result, *result_device;
-
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t1_device), sizeof(o2::track::TrackParCov)));
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t2_device), sizeof(o2::track::TrackParCov)));
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
-
-  gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
-  gpuCheckError(cudaMemcpy(t1_device, &track1, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
-  gpuCheckError(cudaMemcpy(t2_device, &track2, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
-
-  kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, t1_device, t2_device, result_device);
-
-  gpuCheckError(cudaPeekAtLastError());
-  gpuCheckError(cudaDeviceSynchronize());
-
-  gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&track1, t1_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&track2, t2_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaFree(ft_device));
-  gpuCheckError(cudaFree(t1_device));
-  gpuCheckError(cudaFree(t2_device));
-
-  gpuCheckError(cudaFree(result_device));
-
-  return result;
-}
-
-template <int N, class... Tr>
-int process(o2::vertexing::DCAFitterN<2>&,
-            const int nBlocks = 1,
-            const int nThreads = 1,
+template <typename Fitter, class... Tr>
+int process(const int nBlocks,
+            const int nThreads,
+            Fitter& fitter,
             Tr&... args)
 {
-  DCAFitterN<N>* ft_device;
-  std::array<o2::track::TrackParCov*, N> tracks_device;
-  // o2::track::TrackParCov* t1_device;
-  // o2::track::TrackParCov* t2_device;
+  Fitter* ft_device;
+  std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
   int result, *result_device;
 
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<N>)));
-  // gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t1_device), sizeof(o2::track::TrackParCov)));
-  // gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t2_device), sizeof(o2::track::TrackParCov)));
-  for (int iT{0}; iT < N; ++iT) {
-    gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iT])), sizeof(o2::track::TrackParCov)));
-  }
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
   gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
 
-  gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
-  gpuCheckError(cudaMemcpy(t1_device, &track1, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
-  gpuCheckError(cudaMemcpy(t2_device, &track2, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
+  int iArg{0};
+  ([&] {
+    gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(o2::track::TrackParCov)));
+    gpuCheckError(cudaMemcpy(tracks_device[iArg], &args, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
+    ++iArg;
+  }(),
+   ...);
+
+  gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, t1_device, t2_device, result_device);
+  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, result_device, args...); }, tracks_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
 
   gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&track1, t1_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&track2, t2_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaFree(ft_device));
-  gpuCheckError(cudaFree(t1_device));
-  gpuCheckError(cudaFree(t2_device));
+  gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
+  iArg = 0;
+  ([&] {
+    gpuCheckError(cudaMemcpy(&args, tracks_device[iArg], sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
+    gpuCheckError(cudaFree(tracks_device[iArg]));
+    ++iArg;
+  }(),
+   ...);
 
   gpuCheckError(cudaFree(result_device));
 
   return result;
 }
 
+template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
+template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
+template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);
+template void print(const int, const int, o2::vertexing::DCAFitterN<3>&);
 } // namespace o2::vertexing::device
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
index 57172079ccf09..2a2d42105d6b6 100644
--- a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
+++ b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -189,7 +189,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
 
       ft.setUseAbsDCA(true);
       swA.Start(false);
-      int ncA = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncA = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swA.Stop();
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncA) {
@@ -201,7 +201,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(true);
       ft.setWeightedFinalPCA(true);
       swAW.Start(false);
-      int ncAW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncAW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swAW.Stop();
       LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncAW) {
@@ -213,7 +213,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(false);
       ft.setWeightedFinalPCA(false);
       swW.Start(false);
-      int ncW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swW.Stop();
       LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncW) {
@@ -222,8 +222,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
         nfoundW++;
       }
     }
-    // ft.print();
-    device::print(ft, 1, 1);
+    device::print(1, 1, ft);
     meanDA /= nfoundA ? nfoundA : 1;
     meanDAW /= nfoundA ? nfoundA : 1;
     meanDW /= nfoundW ? nfoundW : 1;
@@ -268,7 +267,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
 
       ft.setUseAbsDCA(true);
       swA.Start(false);
-      int ncA = device::process(ft, vctracks[0], vctracks[1]);
+      int ncA = device::process(1, 1, ft, vctracks[0], vctracks[1]);
       swA.Stop();
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncA) {
@@ -280,7 +279,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(true);
       ft.setWeightedFinalPCA(true);
       swAW.Start(false);
-      int ncAW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncAW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swAW.Stop();
       LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncAW) {
@@ -292,7 +291,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(false);
       ft.setWeightedFinalPCA(false);
       swW.Start(false);
-      int ncW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swW.Stop();
       LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncW) {
@@ -302,7 +301,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       }
     }
 
-    device::print(ft, 1, 1);
+    device::print(1, 1, ft);
     meanDA /= nfoundA ? nfoundA : 1;
     meanDAW /= nfoundA ? nfoundA : 1;
     meanDW /= nfoundW ? nfoundW : 1;
@@ -347,7 +346,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
 
       ft.setUseAbsDCA(true);
       swA.Start(false);
-      int ncA = device::process(ft, vctracks[0], vctracks[1]);
+      int ncA = device::process(1, 1, ft, vctracks[0], vctracks[1]);
       swA.Stop();
       LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncA) {
@@ -359,7 +358,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(true);
       ft.setWeightedFinalPCA(true);
       swAW.Start(false);
-      int ncAW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncAW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swAW.Stop();
       LOG(debug) << "fit abs.dist  " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncAW) {
@@ -371,7 +370,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(false);
       ft.setWeightedFinalPCA(false);
       swW.Start(false);
-      int ncW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swW.Stop();
       LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncW) {
@@ -380,7 +379,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
         nfoundW++;
       }
     }
-    device::print(ft, 1, 1);
+    device::print(1, 1, ft);
     meanDA /= nfoundA ? nfoundA : 1;
     meanDAW /= nfoundAW ? nfoundAW : 1;
     meanDW /= nfoundW ? nfoundW : 1;
@@ -424,7 +423,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
 
       ft.setUseAbsDCA(true);
       swA.Start(false);
-      int ncA = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncA = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swA.Stop();
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncA) {
@@ -436,7 +435,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(true);
       ft.setWeightedFinalPCA(true);
       swAW.Start(false);
-      int ncAW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncAW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swAW.Stop();
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncAW) {
@@ -448,7 +447,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(false);
       ft.setWeightedFinalPCA(false);
       swW.Start(false);
-      int ncW = device::process(ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncW = device::process(1, 1, ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swW.Stop();
       LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncW) {
@@ -457,17 +456,17 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
         nfoundW++;
       }
     }
-    device::print(ft, 1, 1);
+    device::print(1, 1, ft);
     meanDA /= nfoundA ? nfoundA : 1;
     meanDAW /= nfoundAW ? nfoundAW : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices: Line : Line";
     LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
-               << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
+              << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
     LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
-               << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
+              << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
     LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
-               << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
+              << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
     BOOST_CHECK(nfoundA > 0.99 * NTest);
     BOOST_CHECK(nfoundAW > 0.99 * NTest);
     BOOST_CHECK(nfoundW > 0.99 * NTest);
@@ -476,81 +475,81 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
     BOOST_CHECK(meanDW < 0.1);
   }
 
-  // // 3 prongs vertices
-  // {
-  //   std::vector<int> forceQ{1, 1, 1};
-
-  //   o2::vertexing::DCAFitterN<3> ft; // 3 prong fitter
-  //   ft.setBz(bz);
-  //   ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
-  //   ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
-  //   ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
-  //   ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
-  //   ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
-
-  //   std::string treeName3A = "pr3a", treeName3AW = "pr3aw", treeName3W = "pr3w";
-  //   TStopwatch swA, swAW, swW;
-  //   int nfoundA = 0, nfoundAW = 0, nfoundW = 0;
-  //   double meanDA = 0, meanDAW = 0, meanDW = 0;
-  //   swA.Stop();
-  //   swAW.Stop();
-  //   swW.Stop();
-  //   for (int iev = 0; iev < NTest; iev++) {
-  //     auto genParent = generate(vtxGen, vctracks, bz, genPHS, dch, dchdec, forceQ);
-
-  //     ft.setUseAbsDCA(true);
-  //     swA.Start(false);
-  //     int ncA = ft.process(vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
-  //     swA.Stop();
-  //     LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
-  //     if (ncA) {
-  //       auto minD = checkResults(outStream, treeName3A, ft, vtxGen, genParent, dchdec);
-  //       meanDA += minD;
-  //       nfoundA++;
-  //     }
-
-  //     ft.setUseAbsDCA(true);
-  //     ft.setWeightedFinalPCA(true);
-  //     swAW.Start(false);
-  //     int ncAW = ft.process(vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
-  //     swAW.Stop();
-  //     LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
-  //     if (ncAW) {
-  //       auto minD = checkResults(outStream, treeName3AW, ft, vtxGen, genParent, dchdec);
-  //       meanDAW += minD;
-  //       nfoundAW++;
-  //     }
-
-  //     ft.setUseAbsDCA(false);
-  //     ft.setWeightedFinalPCA(false);
-  //     swW.Start(false);
-  //     int ncW = ft.process(vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
-  //     swW.Stop();
-  //     LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
-  //     if (ncW) {
-  //       auto minD = checkResults(outStream, treeName3W, ft, vtxGen, genParent, dchdec);
-  //       meanDW += minD;
-  //       nfoundW++;
-  //     }
-  //   }
-  //   ft.print();
-  //   meanDA /= nfoundA ? nfoundA : 1;
-  //   meanDAW /= nfoundAW ? nfoundAW : 1;
-  //   meanDW /= nfoundW ? nfoundW : 1;
-  //   LOG(debug) << "Processed " << NTest << " 3-prong vertices";
-  //   LOG(debug) << "3-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
-  //             << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
-  //   LOG(debug) << "3-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
-  //             << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
-  //   LOG(debug) << "3-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
-  //             << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
-  //   BOOST_CHECK(nfoundA > 0.99 * NTest);
-  //   BOOST_CHECK(nfoundAW > 0.99 * NTest);
-  //   BOOST_CHECK(nfoundW > 0.99 * NTest);
-  //   BOOST_CHECK(meanDA < 0.1);
-  //   BOOST_CHECK(meanDAW < 0.1);
-  //   BOOST_CHECK(meanDW < 0.1);
-  // }
+  // 3 prongs vertices
+  {
+    std::vector<int> forceQ{1, 1, 1};
+
+    o2::vertexing::DCAFitterN<3> ft; // 3 prong fitter
+    ft.setBz(bz);
+    ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
+    ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
+    ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
+    ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
+    ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
+
+    std::string treeName3A = "pr3a", treeName3AW = "pr3aw", treeName3W = "pr3w";
+    TStopwatch swA, swAW, swW;
+    int nfoundA = 0, nfoundAW = 0, nfoundW = 0;
+    double meanDA = 0, meanDAW = 0, meanDW = 0;
+    swA.Stop();
+    swAW.Stop();
+    swW.Stop();
+    for (int iev = 0; iev < NTest; iev++) {
+      auto genParent = generate(vtxGen, vctracks, bz, genPHS, dch, dchdec, forceQ);
+
+      ft.setUseAbsDCA(true);
+      swA.Start(false);
+      int ncA = device::process(1, 1, ft, vctracks[0], vctracks[1], vctracks[2]);
+      swA.Stop();
+      LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
+      if (ncA) {
+        auto minD = checkResults(outStream, treeName3A, ft, vtxGen, genParent, dchdec);
+        meanDA += minD;
+        nfoundA++;
+      }
+
+      ft.setUseAbsDCA(true);
+      ft.setWeightedFinalPCA(true);
+      swAW.Start(false);
+      int ncAW = device::process(1, 1, ft, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+      swAW.Stop();
+      LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
+      if (ncAW) {
+        auto minD = checkResults(outStream, treeName3AW, ft, vtxGen, genParent, dchdec);
+        meanDAW += minD;
+        nfoundAW++;
+      }
+
+      ft.setUseAbsDCA(false);
+      ft.setWeightedFinalPCA(false);
+      swW.Start(false);
+      int ncW = device::process(1, 1, ft, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES
+      swW.Stop();
+      LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
+      if (ncW) {
+        auto minD = checkResults(outStream, treeName3W, ft, vtxGen, genParent, dchdec);
+        meanDW += minD;
+        nfoundW++;
+      }
+    }
+    device::print(1, 1, ft);
+    meanDA /= nfoundA ? nfoundA : 1;
+    meanDAW /= nfoundAW ? nfoundAW : 1;
+    meanDW /= nfoundW ? nfoundW : 1;
+    LOG(info) << "Processed " << NTest << " 3-prong vertices";
+    LOG(info) << "3-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
+              << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
+    LOG(info) << "3-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
+              << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
+    LOG(info) << "3-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
+              << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
+    BOOST_CHECK(nfoundA > 0.99 * NTest);
+    BOOST_CHECK(nfoundAW > 0.99 * NTest);
+    BOOST_CHECK(nfoundW > 0.99 * NTest);
+    BOOST_CHECK(meanDA < 0.1);
+    BOOST_CHECK(meanDAW < 0.1);
+    BOOST_CHECK(meanDW < 0.1);
+  }
 
   outStream.Close();
 }
diff --git a/Common/DCAFitter/GPU/hip/CMakeLists.txt b/Common/DCAFitter/GPU/hip/CMakeLists.txt
index 39ddce02209ed..272d18a81bab4 100644
--- a/Common/DCAFitter/GPU/hip/CMakeLists.txt
+++ b/Common/DCAFitter/GPU/hip/CMakeLists.txt
@@ -9,7 +9,6 @@
 # granted to it by virtue of its status as an Intergovernmental Organization
 # or submit itself to any jurisdiction.
 
-add_compile_definitions(GPUCA_GPU_DEBUG_PRINT)
 set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -fgpu-rdc")
 o2_add_hipified_library(DCAFitterHIP
                         SOURCES ../cuda/DCAFitterN.cu
diff --git a/Common/DCAFitter/include/DCAFitter/DCAFitterN.h b/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
index 732bf058bd013..4ab7098b87052 100644
--- a/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
+++ b/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
@@ -1136,31 +1136,21 @@ GPUdi() bool DCAFitterN<N, Args...>::propagateToX(o2::track::TrackParCov& t, flo
 
 using DCAFitter2 = DCAFitterN<2, o2::track::TrackParCov>;
 using DCAFitter3 = DCAFitterN<3, o2::track::TrackParCov>;
-#ifdef GPUCA_GPUCODE
-namespace gpu::kernel
-{
-GPUg() void printKernel(o2::vertexing::DCAFitterN<2>* ft);
-GPUg() void processKernel(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParCov* t1, o2::track::TrackParCov* t2, int* res);
-} // namespace gpu::kernel
-#endif
+
 namespace device
 {
-void print(o2::vertexing::DCAFitterN<2>&,
-           const int nBlocks = 1,
-           const int nThreads = 1);
-
-int process(o2::vertexing::DCAFitterN<2>&,
-            o2::track::TrackParCov&,
-            o2::track::TrackParCov&,
-            const int nBlocks = 1,
-            const int nThreads = 1);
-
-template <int N, class... Tr>
-int process(o2::vertexing::DCAFitterN<2>&,
-            const int nBlocks = 1,
-            const int nThreads = 1,
-            Args&... args);
+template <typename Fitter>
+void print(const int nBlocks,
+           const int nThreads,
+           Fitter& ft);
+
+template <typename Fitter, class... Tr>
+int process(const int nBlocks,
+            const int nThreads,
+            Fitter&,
+            Tr&... args);
 } // namespace device
+
 } // namespace vertexing
 } // namespace o2
 #endif // _ALICEO2_DCA_FITTERN_