From 09fdad9cfc73908d5d08733eb4db87ae7125cd3f Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Fri, 6 Sep 2024 16:02:32 +0200
Subject: [PATCH] Remove API interface, cleanup and restore test

---
 Common/DCAFitter/GPU/cuda/CMakeLists.txt      |  27 +---
 Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.cu  |  86 -----------
 Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.h   |  20 ---
 Common/DCAFitter/GPU/cuda/DCAFitterN.cu       |  52 ++++++-
 Common/DCAFitter/GPU/cuda/test/main_prog.cxx  | 138 ------------------
 .../GPU/cuda/test/testDCAFitterNGPU.cxx       |  19 +--
 Common/DCAFitter/GPU/hip/CMakeLists.txt       |  35 ++---
 .../DCAFitter/include/DCAFitter/DCAFitterN.h  |  35 +++--
 8 files changed, 87 insertions(+), 325 deletions(-)
 delete mode 100644 Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.cu
 delete mode 100644 Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.h
 delete mode 100644 Common/DCAFitter/GPU/cuda/test/main_prog.cxx

diff --git a/Common/DCAFitter/GPU/cuda/CMakeLists.txt b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
index 1c7a2572fd424..a498d0c350202 100644
--- a/Common/DCAFitter/GPU/cuda/CMakeLists.txt
+++ b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
@@ -9,8 +9,6 @@
 # granted to it by virtue of its status as an Intergovernmental Organization
 # or submit itself to any jurisdiction.
 
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
-
 o2_add_library(DCAFitterCUDA
                TARGETVARNAME targetName
                SOURCES DCAFitterN.cu
@@ -21,35 +19,14 @@ o2_add_library(DCAFitterCUDA
                PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
 set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 
-o2_add_library(DCAFitterAPICUDA
-              # TARGETVARNAME targetName2
-               SOURCES DCAFitterGPUAPI.cu
-               PUBLIC_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Common/DCAFitter/GPU
-               PUBLIC_LINK_LIBRARIES O2::GPUCommon
-                                     O2::ReconstructionDataFormats
-               PRIVATE_LINK_LIBRARIES O2::DCAFitterCUDA
-               # O2::GPUTrackingCUDAExternalProvider
-                )
-# set_property(TARGET ${targetName2} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
-
 o2_add_test(DCAFitterNCUDA
             SOURCES test/testDCAFitterNGPU.cxx
             PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats
-                                  O2::DCAFitterAPICUDA
+                                  O2::DCAFitterCUDA
                                   O2::DCAFitter
                                   ROOT::Core
                                   ROOT::Physics
             COMPONENT_NAME gpu
             LABELS vertexing
             ENVIRONMENT O2_ROOT=${CMAKE_BINARY_DIR}/stage
-            VMCWORKDIR=${CMAKE_BINARY_DIR}/stage/${CMAKE_INSTALL_DATADIR})
-
-o2_add_executable(dca-fitter-on-gpu-cuda
-                  TARGETVARNAME targetName3
-                  SOURCES test/main_prog.cxx
-                  PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats
-                                        O2::DCAFitterCUDA
-                                        O2::DCAFitter
-                                        ROOT::Core
-                                        ROOT::Physics)
-# add_compile_options("-g -O0")
\ No newline at end of file
+            VMCWORKDIR=${CMAKE_BINARY_DIR}/stage/${CMAKE_INSTALL_DATADIR})
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.cu b/Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.cu
deleted file mode 100644
index 1b5b509965210..0000000000000
--- a/Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.cu
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
-#ifdef __HIPCC__
-#include "hip/hip_runtime.h"
-#else
-#include <cuda.h>
-#endif
-
-#include "DCAFitter/DCAFitterN.h"
-
-#include "ReconstructionDataFormats/Track.h"
-
-#include <iostream>
-#include <cstdint>
-
-#define gpuCheckError(x)                \
-  {                                     \
-    gpuAssert((x), __FILE__, __LINE__); \
-  }
-inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
-{
-  if (code != cudaSuccess) {
-    std::cout << "GPUassert: " << cudaGetErrorString(code) << " " << file << " " << line << std::endl;
-    if (abort) {
-      throw std::runtime_error("GPU assert failed.");
-    }
-  }
-}
-
-namespace o2::vertexing::gpu
-{
-/// Print fitter
-void doPrintOnDevice(o2::vertexing::DCAFitterN<2>* ft)
-{
-  DCAFitterN<2>* ft_device;
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
-  gpuCheckError(cudaMemcpy(ft_device, ft, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
-  LOGP(info, "ft: {} ft_device: {} size: {}", (void*)ft, (void*)ft_device, sizeof(o2::vertexing::DCAFitterN<2>));
-  kernel::printKernel<<<1, 2049>>>(ft_device);
-  LOGP(info, "here.");
-  gpuCheckError(cudaPeekAtLastError());
-  gpuCheckError(cudaDeviceSynchronize());
-  LOGP(info, "here 2.");
-
-  gpuCheckError(cudaFree(ft_device));
-}
-
-/// Call the process(track, ...) method
-int doProcessOnDevice(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParCov* t1, o2::track::TrackParCov* t2)
-{
-  DCAFitterN<2>* ft_device;
-  o2::track::TrackParCov* t1_device;
-  o2::track::TrackParCov* t2_device;
-  int result, *result_device;
-
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t1_device), sizeof(o2::track::TrackParCov)));
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t2_device), sizeof(o2::track::TrackParCov)));
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
-
-  gpuCheckError(cudaMemcpy(ft_device, ft, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
-  gpuCheckError(cudaMemcpy(t1_device, t1, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
-  gpuCheckError(cudaMemcpy(t2_device, t2, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
-
-  gpu::kernel::processKernel<<<1, 1>>>(ft_device, t1_device, t2_device, result_device);
-  gpuCheckError(cudaPeekAtLastError());
-  gpuCheckError(cudaDeviceSynchronize());
-
-  gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaFree(ft_device));
-  gpuCheckError(cudaFree(t1_device));
-  gpuCheckError(cudaFree(t2_device));
-  gpuCheckError(cudaFree(result_device));
-
-  return result;
-}
-} // namespace o2::vertexing::gpu
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.h b/Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.h
deleted file mode 100644
index d461bc861ba4c..0000000000000
--- a/Common/DCAFitter/GPU/cuda/DCAFitterGPUAPI.h
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
-#ifndef DCAFITTERN_GPU_API_H_
-#define DCAFITTERN_GPU_API_H_
-
-namespace o2::vertexing::gpu
-{
-void doPrintOnDevice(o2::vertexing::DCAFitterN<2>* ft);
-int doProcessOnDevice(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParCov* t1, o2::track::TrackParCov* t2);
-} // namespace o2::vertexing::gpu
-#endif
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
index 413aaff642d28..9f513c499e52c 100644
--- a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
+++ b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -49,18 +49,58 @@ GPUg() void processKernel(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParC
 {
   *res = ft->process(*t1, *t2);
 }
+} // namespace kernel
 
-void printKHost(o2::vertexing::DCAFitterN<2>* ft, int th, int bl)
+void printOnDevice(o2::vertexing::DCAFitterN<2>* ft,
+                   const int nBlocks,
+                   const int nThreads)
 {
   DCAFitterN<2>* ft_device;
   gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
   gpuCheckError(cudaMemcpy(ft_device, ft, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
-  LOGP(info, "ft: {} ft_device: {} size: {}", (void*)ft, (void*)ft_device, sizeof(o2::vertexing::DCAFitterN<2>));
-  printKernel<<<bl, th>>>(ft);
+
+  kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
+
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
-  // static_assert(false);
 }
-} // namespace kernel
 
-} // namespace o2::vertexing::gpu
+int processOnDevice(o2::vertexing::DCAFitterN<2>* fitter,
+                    o2::track::TrackParCov& track1,
+                    o2::track::TrackParCov& track2,
+                    const int nBlocks,
+                    const int nThreads)
+{
+  DCAFitterN<2>* ft_device;
+  o2::track::TrackParCov* t1_device;
+  o2::track::TrackParCov* t2_device;
+  int result, *result_device;
+
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(o2::vertexing::DCAFitterN<2>)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t1_device), sizeof(o2::track::TrackParCov)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&t2_device), sizeof(o2::track::TrackParCov)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
+
+  gpuCheckError(cudaMemcpy(ft_device, fitter, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(t1_device, &track1, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(t2_device, &track2, sizeof(o2::track::TrackParCov), cudaMemcpyHostToDevice));
+
+  kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, t1_device, t2_device, result_device);
+
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+
+  gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(fitter, ft_device, sizeof(o2::vertexing::DCAFitterN<2>), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&track1, t1_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&track2, t2_device, sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaFree(ft_device));
+  gpuCheckError(cudaFree(t1_device));
+  gpuCheckError(cudaFree(t2_device));
+
+  gpuCheckError(cudaFree(result_device));
+
+  return result;
+}
+
+} // namespace o2::vertexing::gpu
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/test/main_prog.cxx b/Common/DCAFitter/GPU/cuda/test/main_prog.cxx
deleted file mode 100644
index d18ed439411c4..0000000000000
--- a/Common/DCAFitter/GPU/cuda/test/main_prog.cxx
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
-#include "DCAFitter/DCAFitterN.h"
-#include "CommonUtils/TreeStreamRedirector.h"
-#include <TRandom.h>
-#include <TGenPhaseSpace.h>
-#include <TLorentzVector.h>
-#include <TStopwatch.h>
-#include <Math/SVector.h>
-#include <array>
-
-// #include "../DCAFitterGPUAPI.h"
-
-namespace o2::vertexing
-{
-using Vec3D = ROOT::Math::SVector<double, 3>;
-TLorentzVector generate(Vec3D& vtx, std::vector<o2::track::TrackParCov>& vctr, float bz,
-                        TGenPhaseSpace& genPHS, double parMass, const std::vector<double>& dtMass, std::vector<int> forceQ)
-{
-  const float errYZ = 1e-2, errSlp = 1e-3, errQPT = 2e-2;
-  std::array<float, 15> covm = {
-    errYZ * errYZ,
-    0., errYZ * errYZ,
-    0, 0., errSlp * errSlp,
-    0., 0., 0., errSlp * errSlp,
-    0., 0., 0., 0., errQPT * errQPT};
-  bool accept = true;
-  TLorentzVector parent, d0, d1, d2;
-  do {
-    accept = true;
-    double y = gRandom->Rndm() - 0.5;
-    double pt = 0.1 + gRandom->Rndm() * 3;
-    double mt = TMath::Sqrt(parMass * parMass + pt * pt);
-    double pz = mt * TMath::SinH(y);
-    double phi = gRandom->Rndm() * TMath::Pi() * 2;
-    double en = mt * TMath::CosH(y);
-    double rdec = 10.; // radius of the decay
-    vtx[0] = rdec * TMath::Cos(phi);
-    vtx[1] = rdec * TMath::Sin(phi);
-    vtx[2] = rdec * pz / pt;
-    parent.SetPxPyPzE(pt * TMath::Cos(phi), pt * TMath::Sin(phi), pz, en);
-    int nd = dtMass.size();
-    genPHS.SetDecay(parent, nd, dtMass.data());
-    genPHS.Generate();
-    vctr.clear();
-    float p[4];
-    for (int i = 0; i < nd; i++) {
-      auto* dt = genPHS.GetDecay(i);
-      if (dt->Pt() < 0.05) {
-        accept = false;
-        break;
-      }
-      dt->GetXYZT(p);
-      float s, c, x;
-      std::array<float, 5> params;
-      o2::math_utils::sincos(dt->Phi(), s, c);
-      o2::math_utils::rotateZInv(vtx[0], vtx[1], x, params[0], s, c);
-
-      params[1] = vtx[2];
-      params[2] = 0.; // since alpha = phi
-      params[3] = 1. / TMath::Tan(dt->Theta());
-      params[4] = (i % 2 ? -1. : 1.) / dt->Pt();
-      covm[14] = errQPT * errQPT * params[4] * params[4];
-      //
-      // randomize
-      float r1, r2;
-      gRandom->Rannor(r1, r2);
-      params[0] += r1 * errYZ;
-      params[1] += r2 * errYZ;
-      gRandom->Rannor(r1, r2);
-      params[2] += r1 * errSlp;
-      params[3] += r2 * errSlp;
-      params[4] *= gRandom->Gaus(1., errQPT);
-      if (forceQ[i] == 0) {
-        params[4] = 0.; // impose straight track
-      }
-      auto& trc = vctr.emplace_back(x, dt->Phi(), params, covm);
-      float rad = forceQ[i] == 0 ? 600. : TMath::Abs(1. / trc.getCurvature(bz));
-      if (!trc.propagateTo(trc.getX() + (gRandom->Rndm() - 0.5) * rad * 0.05, bz) ||
-          !trc.rotate(trc.getAlpha() + (gRandom->Rndm() - 0.5) * 0.2)) {
-        printf("Failed to randomize ");
-        trc.print();
-      }
-    }
-  } while (!accept);
-
-  return parent;
-}
-
-int run()
-{
-  TGenPhaseSpace genPHS;
-  constexpr double ele = 0.00051;
-  constexpr double gamma = 2 * ele + 1e-6;
-  constexpr double pion = 0.13957;
-  constexpr double k0 = 0.49761;
-  constexpr double kch = 0.49368;
-  constexpr double dch = 1.86965;
-  std::vector<double> gammadec = {ele, ele};
-  std::vector<double> k0dec = {pion, pion};
-  std::vector<double> dchdec = {pion, kch, pion};
-  std::vector<o2::track::TrackParCov> vctracks;
-  Vec3D vtxGen;
-
-  double bz = 5.0;
-  std::vector<int> forceQ{1, 1};
-  o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
-  ft.setBz(bz);
-  ft.setPropagateToPCA(true);  // After finding the vertex, propagate tracks to the DCA. This is default anyway
-  ft.setMaxR(200);             // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
-  ft.setMaxDZIni(4);           // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
-  ft.setMaxDXYIni(4);          // do not consider V0 seeds with tracks XY-distance exceeding this. This is default anyway
-  ft.setMinParamChange(1e-3);  // stop iterations if max correction is below this value. This is default anyway
-  ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor
-
-  auto genParent = generate(vtxGen, vctracks, bz, genPHS, k0, k0dec, forceQ);
-  ft.setUseAbsDCA(true);
-  auto res = ft.process(vctracks[0], vctracks[1]);
-  ft.print();
-  std::cout << " => " << sizeof(DCAFitterN<2>) << std::endl;
-  std::cout << "returned value: " << res << std::endl;
-
-  // o2::vertexing::gpu::doPrintOnDevice(&ft);
-  o2::vertexing::gpu::kernel::printKHost(&ft, 1, 1);
-  return 0;
-}
-} // namespace o2::vertexing
-
-int main() { return o2::vertexing::run(); }
\ No newline at end of file
diff --git a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
index 9bcbd23354ae6..8f1ee3bc30644 100644
--- a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
+++ b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
@@ -23,8 +23,6 @@
 #include <Math/SVector.h>
 #include <array>
 
-#include "../DCAFitterGPUAPI.h"
-
 namespace o2
 {
 namespace vertexing
@@ -189,8 +187,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       auto genParent = generate(vtxGen, vctracks, bz, genPHS, k0, k0dec, forceQ);
       ft.setUseAbsDCA(true);
       swA.Start(false);
-      int ncA = o2::vertexing::gpu::doProcessOnDevice(&ft, &(vctracks[0]), &(vctracks[1]));
-      // int ncA = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncA = gpu::processOnDevice(&ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swA.Stop();
       LOG(debug) << "fit abs.dist " << iev << " NC: " << ncA << " Chi2: " << (ncA ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncA) {
@@ -202,8 +199,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(true);
       ft.setWeightedFinalPCA(true);
       swAW.Start(false);
-      int ncAW = o2::vertexing::gpu::doProcessOnDevice(&ft, &(vctracks[0]), &(vctracks[1]));
-      // int ncAW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncAW = gpu::processOnDevice(&ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swAW.Stop();
       LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncAW << " Chi2: " << (ncAW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncAW) {
@@ -215,8 +211,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       ft.setUseAbsDCA(false);
       ft.setWeightedFinalPCA(false);
       swW.Start(false);
-      int ncW = o2::vertexing::gpu::doProcessOnDevice(&ft, &(vctracks[0]), &(vctracks[1]));
-      // int ncW = ft.process(vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
+      int ncW = gpu::processOnDevice(&ft, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
       swW.Stop();
       LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncW << " Chi2: " << (ncW ? ft.getChi2AtPCACandidate(0) : -1);
       if (ncW) {
@@ -226,17 +221,17 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
       }
     }
     ft.print();
-    o2::vertexing::gpu::doPrintOnDevice(&ft);
+    gpu::printOnDevice(&ft, 1, 1);
     meanDA /= nfoundA ? nfoundA : 1;
     meanDAW /= nfoundA ? nfoundA : 1;
     meanDW /= nfoundW ? nfoundW : 1;
     LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix";
     LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
-              << " mean.dist to truth: " << meanDA << " CPU time: " << swA.CpuTime();
+              << " mean.dist to truth: " << meanDA << " GPU time: " << swA.CpuTime();
     LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest
-              << " mean.dist to truth: " << meanDAW << " CPU time: " << swAW.CpuTime();
+              << " mean.dist to truth: " << meanDAW << " GPU time: " << swAW.CpuTime();
     LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest
-              << " mean.dist to truth: " << meanDW << " CPU time: " << swW.CpuTime();
+              << " mean.dist to truth: " << meanDW << " GPU time: " << swW.CpuTime();
     BOOST_CHECK(nfoundA > 0.99 * NTest);
     // BOOST_CHECK(nfoundAW > 0.99 * NTest);
     // BOOST_CHECK(nfoundW > 0.99 * NTest);
diff --git a/Common/DCAFitter/GPU/hip/CMakeLists.txt b/Common/DCAFitter/GPU/hip/CMakeLists.txt
index 2b5735a9cbe59..d270885adee2c 100644
--- a/Common/DCAFitter/GPU/hip/CMakeLists.txt
+++ b/Common/DCAFitter/GPU/hip/CMakeLists.txt
@@ -22,29 +22,18 @@ o2_add_hipified_library(DCAFitterHIP
 # target_compile_options(${targetName} PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>)
 # target_link_options(${targetName} PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>)
 
-# o2_add_hipified_library(DCAFitterAPIHIP
-#                         SOURCES ../cuda/DCAFitterGPUAPI.cu
-#                         ../cuda/DCAFitterN.cu
-#                         PUBLIC_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Common/DCAFitter/GPU
-#                                                    ../cuda
-#                         PUBLIC_LINK_LIBRARIES O2::GPUCommon
-#                                               O2::ReconstructionDataFormats
-#                         PRIVATE_LINK_LIBRARIES O2::DCAFitterHIP 
-#                         # O2::GPUTrackingHIPExternalProvider
-#                         TARGETVARNAME targetName)
-
-# o2_add_test(DCAFitterNHIP
-#             SOURCES ../cuda/test/testDCAFitterNGPU.cxx
-#             PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats
-#                                   O2::DCAFitterAPIHIP
-#                                   O2::DCAFitter
-#                                   ROOT::Core
-#                                   ROOT::Physics
-#             HIPIFIED test
-#             COMPONENT_NAME gpu
-#             LABELS vertexing
-#             ENVIRONMENT O2_ROOT=${CMAKE_BINARY_DIR}/stage
-#             VMCWORKDIR=${CMAKE_BINARY_DIR}/stage/${CMAKE_INSTALL_DATADIR})
+o2_add_test(DCAFitterNHIP
+            SOURCES ../cuda/test/testDCAFitterNGPU.cxx
+            PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats
+                                  O2::DCAFitterHIP
+                                  O2::DCAFitter
+                                  ROOT::Core
+                                  ROOT::Physics
+            HIPIFIED test
+            COMPONENT_NAME gpu
+            LABELS vertexing
+            ENVIRONMENT O2_ROOT=${CMAKE_BINARY_DIR}/stage
+            VMCWORKDIR=${CMAKE_BINARY_DIR}/stage/${CMAKE_INSTALL_DATADIR})
 
 
 
diff --git a/Common/DCAFitter/include/DCAFitter/DCAFitterN.h b/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
index 550aff307d658..ce8f2f774508e 100644
--- a/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
+++ b/Common/DCAFitter/include/DCAFitter/DCAFitterN.h
@@ -1011,22 +1011,20 @@ GPUd() bool DCAFitterN<N, Args...>::closerToAlternative() const
 template <int N, typename... Args>
 GPUd() void DCAFitterN<N, Args...>::print() const
 {
-#if !defined(GPUCA_GPUCODE)
+#ifndef GPUCA_GPUCODE_DEVICE
   LOG(info) << N << "-prong vertex fitter in " << (mUseAbsDCA ? "abs." : "weighted") << " distance minimization mode";
   LOG(info) << "Bz: " << mBz << " MaxIter: " << mMaxIter << " MaxChi2: " << mMaxChi2;
   LOG(info) << "Stopping condition: Max.param change < " << mMinParamChange << " Rel.Chi2 change > " << mMinRelChi2Change;
   LOG(info) << "Discard candidates for : Rvtx > " << getMaxR() << " DZ between tracks > " << mMaxDZIni;
-#endif
-#if defined(GPUCA_GPUCODE_DEVICE)
-  // if (mUseAbsDCA) {
-  printf("test %d\n", sizeof(DCAFitterN<2>));
-  // printf("%d-prong vertex fitter in abs. distance minimization mode\n", N);
-  // // } else {
-  // printf("%d-prong vertex fitter in weighted distance minimization mode\n", N);
-  // // }
-  // printf("Bz: %f MaxIter: %d  MaxChi2: %f\n", mBz, mMaxIter, mMaxChi2);
-  // printf("Stopping condition: Max.param change < %f Rel.Chi2 change > %f\n", mMinParamChange, mMinRelChi2Change);
-  // printf("Discard candidates for : Rvtx > %f DZ between tracks > %f", getMaxR(), mMaxDZIni);
+#else
+  if (mUseAbsDCA) {
+    printf("%d-prong vertex fitter in abs. distance minimization mode\n", N);
+  } else {
+    printf("%d-prong vertex fitter in weighted distance minimization mode\n", N);
+  }
+  printf("Bz: %f MaxIter: %d  MaxChi2: %f\n", mBz, mMaxIter, mMaxChi2);
+  printf("Stopping condition: Max.param change < %f Rel.Chi2 change > %f\n", mMinParamChange, mMinRelChi2Change);
+  printf("Discard candidates for : Rvtx > %f DZ between tracks > %f\n", getMaxR(), mMaxDZIni);
 #endif
 }
 
@@ -1131,10 +1129,17 @@ GPUg() void printKernel(o2::vertexing::DCAFitterN<2>* ft);
 GPUg() void processKernel(o2::vertexing::DCAFitterN<2>* ft, o2::track::TrackParCov* t1, o2::track::TrackParCov* t2, int* res);
 } // namespace gpu::kernel
 #endif
-namespace gpu::kernel
+namespace gpu
 {
-void printKHost(o2::vertexing::DCAFitterN<2>* ft, int th, int bl);
-}
+void printOnDevice(o2::vertexing::DCAFitterN<2>*,
+                   const int nBlocks = 1,
+                   const int nThreads = 1);
+int processOnDevice(o2::vertexing::DCAFitterN<2>*,
+                    o2::track::TrackParCov&,
+                    o2::track::TrackParCov&,
+                    const int nBlocks = 1,
+                    const int nThreads = 1);
+} // namespace gpu
 } // namespace vertexing
 } // namespace o2
 #endif // _ALICEO2_DCA_FITTERN_