From 5b6577c1d3a6bdaf71a2e2f713d1ff5df8f40254 Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Sat, 26 Oct 2024 14:28:12 +0200
Subject: [PATCH] cp

---
 .../MathUtils/include/MathUtils/SMatrixGPU.h  |  37 +++
 .../TrackParametrizationWithError.h           |  13 +-
 .../src/TrackParametrizationWithError.cxx     |  52 ++--
 .../GPU/ITStrackingGPU/TimeFrameGPU.h         |  26 +-
 .../GPU/ITStrackingGPU/TrackingKernels.h      |  27 ++
 .../ITS/tracking/GPU/cuda/CMakeLists.txt      |   4 +-
 .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu     | 252 ++++++++++--------
 .../tracking/GPU/cuda/TrackerTraitsGPU.cxx    |  88 +++---
 .../ITS/tracking/GPU/cuda/TrackingKernels.cu  | 130 ++++++---
 .../tracking/include/ITStracking/TimeFrame.h  |   2 +-
 .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx |  10 +
 11 files changed, 427 insertions(+), 214 deletions(-)
diff --git a/Common/MathUtils/include/MathUtils/SMatrixGPU.h b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
index ef76c490ddfbd..60965a4fa2776 100644
--- a/Common/MathUtils/include/MathUtils/SMatrixGPU.h
+++ b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
@@ -446,6 +446,8 @@ class SMatrixGPU
   GPUdi() SMatrixGPU(SMatrixNoInit) {}
   GPUd() SMatrixGPU(SMatrixIdentity);
   GPUd() SMatrixGPU(const SMatrixGPU<T, D1, D2, R>& rhs);
+  template <class R2>
+  GPUd() SMatrixGPU(const SMatrixGPU<T, D1, D2, R2>& rhs);
   template <class A, class R2>
   GPUd() SMatrixGPU(const Expr<A, T, D1, D2, R2>& rhs);
   template <class M>
@@ -497,6 +499,11 @@ class SMatrixGPU
   GPUd() SMatrixRowGPU operator[](unsigned int i) { return SMatrixRowGPU(*this, i); }
   template <class R2>
   GPUd() SMatrixGPU<T, D1, D2, R>& operator+=(const SMatrixGPU<T, D1, D2, R2>& rhs);
+  GPUd() SMatrixGPU<T, D1, D2, R>& operator*=(const T& rhs);
+  template <class R2>
+  GPUd() SMatrixGPU<T, D1, D2, R>& operator*=(const SMatrixGPU<T, D1, D2, R2>& rhs);
+  template <class A, class R2>
+  GPUd() SMatrixGPU<T, D1, D2, R>& operator*=(const Expr<A, T, D1, D2, R2>& rhs);
 
   GPUd() bool Invert();
   GPUd() bool IsInUse(const T* p) const;
@@ -528,6 +535,13 @@ GPUdi() SMatrixGPU<T, D1, D2, R>::SMatrixGPU(const SMatrixGPU<T, D1, D2, R>& rhs
   mRep = rhs.mRep;
 }
 
+template <class T, unsigned int D1, unsigned int D2, class R>
+template <class R2>
+GPUd() SMatrixGPU<T, D1, D2, R>::SMatrixGPU(const SMatrixGPU<T, D1, D2, R2>& rhs)
+{
+  operator=(rhs);
+}
+
 template <class T, unsigned int D1, unsigned int D2, class R>
 GPUdi() T* SMatrixGPU<T, D1, D2, R>::begin()
 {
@@ -1387,6 +1401,29 @@ GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator+=(const SMa
   return *this;
 }
 
+template <class T, unsigned int D1, unsigned int D2, class R>
+GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator*=(const T & rhs)
+{
+  for (unsigned int i = 0; i < R::kSize; ++i) {
+    mRep.Array()[i] *= rhs;
+  }
+  return *this;
+}
+
+template <class T, unsigned int D1, unsigned int D2, class R>
+template <class R2>
+GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator*=(const SMatrixGPU<T, D1, D2, R2>& rhs)
+{
+  return operator=(*this* rhs);
+}
+
+template <class T, unsigned int D1, unsigned int D2, class R>
+template <class A, class R2>
+GPUdi() SMatrixGPU<T, D1, D2, R>& SMatrixGPU<T, D1, D2, R>::operator*=(const Expr<A, T, D1, D2, R2>& rhs)
+{
+  return operator=(*this* rhs);
+}
+
 template <class T, unsigned int D1, unsigned int D2, class R>
 struct TranspPolicyGPU {
   enum {
diff --git a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h
index 536bacf1a6a70..015b5d37e258c 100644
--- a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h
+++ b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h
@@ -18,6 +18,7 @@
 #define INCLUDE_RECONSTRUCTIONDATAFORMATS_TRACKPARAMETRIZATIONWITHERROR_H_
 
 #include "ReconstructionDataFormats/TrackParametrization.h"
+#include <MathUtils/Cartesian.h>
 
 namespace o2
 {
@@ -38,8 +39,8 @@ class TrackParametrizationWithError : public TrackParametrization<value_T>
 #endif
 
   using covMat_t = gpu::gpustd::array<value_t, kCovMatSize>;
-  using MatrixDSym5 = ROOT::Math::SMatrix<double, kNParams, kNParams, ROOT::Math::MatRepSym<double, kNParams>>;
-  using MatrixD5 = ROOT::Math::SMatrix<double, kNParams, kNParams, ROOT::Math::MatRepStd<double, kNParams, kNParams>>;
+  using MatrixDSym5 = o2::math_utils::SMatrix<double, kNParams, kNParams, o2::math_utils::MatRepSym<double, kNParams>>;
+  using MatrixD5 = o2::math_utils::SMatrix<double, kNParams, kNParams, o2::math_utils::MatRepStd<double, kNParams, kNParams>>;
 
   GPUd() TrackParametrizationWithError();
   GPUd() TrackParametrizationWithError(value_t x, value_t alpha, const params_t& par, const covMat_t& cov, int charge = 1, const PID pid = PID::Pion);
@@ -100,12 +101,12 @@ class TrackParametrizationWithError : public TrackParametrization<value_T>
   template <typename T>
   GPUd() value_t getPredictedChi2(const BaseCluster<T>& p) const;
 
-  void buildCombinedCovMatrix(const TrackParametrizationWithError& rhs, MatrixDSym5& cov) const;
-  value_t getPredictedChi2(const TrackParametrizationWithError& rhs, MatrixDSym5& covToSet) const;
+  GPUd() void buildCombinedCovMatrix(const TrackParametrizationWithError& rhs, MatrixDSym5& cov) const;
+  GPUd() value_t getPredictedChi2(const TrackParametrizationWithError& rhs, MatrixDSym5& covToSet) const;
   GPUd() value_t getPredictedChi2(const TrackParametrizationWithError& rhs) const;
   GPUd() value_t getPredictedChi2Quiet(const TrackParametrizationWithError& rhs) const;
-  bool update(const TrackParametrizationWithError& rhs, const MatrixDSym5& covInv);
-  bool update(const TrackParametrizationWithError& rhs);
+  GPUd() bool update(const TrackParametrizationWithError& rhs, const MatrixDSym5& covInv);
+  GPUd() bool update(const TrackParametrizationWithError& rhs);
 
   GPUd() bool update(const dim2_t& p, const dim3_t& cov);
   GPUd() bool update(const value_t* p, const value_t* cov);
diff --git a/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx b/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx
index e56830deace14..9f8a93a01e053 100644
--- a/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx
+++ b/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx
@@ -13,6 +13,7 @@
 #include "ReconstructionDataFormats/Vertex.h"
 #include "ReconstructionDataFormats/DCA.h"
 #include <GPUCommonLogger.h>
+#include <cfloat>
 
 #ifndef GPUCA_GPUCODE_DEVICE
 #include <iostream>
@@ -754,30 +755,6 @@ GPUd() auto TrackParametrizationWithError<value_T>::getPredictedChi2Quiet(const
   return (d * (szz * d - sdz * z) + z * (sdd * z - d * sdz)) / det;
 }
 
-#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE) // Disable function relying on ROOT SMatrix on GPU
-
-//______________________________________________
-template <typename value_T>
-void TrackParametrizationWithError<value_T>::buildCombinedCovMatrix(const TrackParametrizationWithError<value_T>& rhs, MatrixDSym5& cov) const
-{
-  // fill combined cov.matrix (NOT inverted)
-  cov(kY, kY) = static_cast<double>(getSigmaY2()) + static_cast<double>(rhs.getSigmaY2());
-  cov(kZ, kY) = static_cast<double>(getSigmaZY()) + static_cast<double>(rhs.getSigmaZY());
-  cov(kZ, kZ) = static_cast<double>(getSigmaZ2()) + static_cast<double>(rhs.getSigmaZ2());
-  cov(kSnp, kY) = static_cast<double>(getSigmaSnpY()) + static_cast<double>(rhs.getSigmaSnpY());
-  cov(kSnp, kZ) = static_cast<double>(getSigmaSnpZ()) + static_cast<double>(rhs.getSigmaSnpZ());
-  cov(kSnp, kSnp) = static_cast<double>(getSigmaSnp2()) + static_cast<double>(rhs.getSigmaSnp2());
-  cov(kTgl, kY) = static_cast<double>(getSigmaTglY()) + static_cast<double>(rhs.getSigmaTglY());
-  cov(kTgl, kZ) = static_cast<double>(getSigmaTglZ()) + static_cast<double>(rhs.getSigmaTglZ());
-  cov(kTgl, kSnp) = static_cast<double>(getSigmaTglSnp()) + static_cast<double>(rhs.getSigmaTglSnp());
-  cov(kTgl, kTgl) = static_cast<double>(getSigmaTgl2()) + static_cast<double>(rhs.getSigmaTgl2());
-  cov(kQ2Pt, kY) = static_cast<double>(getSigma1PtY()) + static_cast<double>(rhs.getSigma1PtY());
-  cov(kQ2Pt, kZ) = static_cast<double>(getSigma1PtZ()) + static_cast<double>(rhs.getSigma1PtZ());
-  cov(kQ2Pt, kSnp) = static_cast<double>(getSigma1PtSnp()) + static_cast<double>(rhs.getSigma1PtSnp());
-  cov(kQ2Pt, kTgl) = static_cast<double>(getSigma1PtTgl()) + static_cast<double>(rhs.getSigma1PtTgl());
-  cov(kQ2Pt, kQ2Pt) = static_cast<double>(getSigma1Pt2()) + static_cast<double>(rhs.getSigma1Pt2());
-}
-
 //______________________________________________
 template <typename value_T>
 GPUd() auto TrackParametrizationWithError<value_T>::getPredictedChi2(const TrackParametrizationWithError<value_T>& rhs) const -> value_t
@@ -819,6 +796,28 @@ GPUd() auto TrackParametrizationWithError<value_T>::getPredictedChi2(const Track
   return chi2diag + 2. * chi2ndiag;
 }
 
+//______________________________________________
+template <typename value_T>
+GPUd() void TrackParametrizationWithError<value_T>::buildCombinedCovMatrix(const TrackParametrizationWithError<value_T>& rhs, MatrixDSym5& cov) const
+{
+  // fill combined cov.matrix (NOT inverted)
+  cov(kY, kY) = static_cast<double>(getSigmaY2()) + static_cast<double>(rhs.getSigmaY2());
+  cov(kZ, kY) = static_cast<double>(getSigmaZY()) + static_cast<double>(rhs.getSigmaZY());
+  cov(kZ, kZ) = static_cast<double>(getSigmaZ2()) + static_cast<double>(rhs.getSigmaZ2());
+  cov(kSnp, kY) = static_cast<double>(getSigmaSnpY()) + static_cast<double>(rhs.getSigmaSnpY());
+  cov(kSnp, kZ) = static_cast<double>(getSigmaSnpZ()) + static_cast<double>(rhs.getSigmaSnpZ());
+  cov(kSnp, kSnp) = static_cast<double>(getSigmaSnp2()) + static_cast<double>(rhs.getSigmaSnp2());
+  cov(kTgl, kY) = static_cast<double>(getSigmaTglY()) + static_cast<double>(rhs.getSigmaTglY());
+  cov(kTgl, kZ) = static_cast<double>(getSigmaTglZ()) + static_cast<double>(rhs.getSigmaTglZ());
+  cov(kTgl, kSnp) = static_cast<double>(getSigmaTglSnp()) + static_cast<double>(rhs.getSigmaTglSnp());
+  cov(kTgl, kTgl) = static_cast<double>(getSigmaTgl2()) + static_cast<double>(rhs.getSigmaTgl2());
+  cov(kQ2Pt, kY) = static_cast<double>(getSigma1PtY()) + static_cast<double>(rhs.getSigma1PtY());
+  cov(kQ2Pt, kZ) = static_cast<double>(getSigma1PtZ()) + static_cast<double>(rhs.getSigma1PtZ());
+  cov(kQ2Pt, kSnp) = static_cast<double>(getSigma1PtSnp()) + static_cast<double>(rhs.getSigma1PtSnp());
+  cov(kQ2Pt, kTgl) = static_cast<double>(getSigma1PtTgl()) + static_cast<double>(rhs.getSigma1PtTgl());
+  cov(kQ2Pt, kQ2Pt) = static_cast<double>(getSigma1Pt2()) + static_cast<double>(rhs.getSigma1Pt2());
+}
+
 //______________________________________________
 template <typename value_T>
 GPUd() bool TrackParametrizationWithError<value_T>::update(const TrackParametrizationWithError<value_T>& rhs, const MatrixDSym5& covInv)
@@ -867,7 +866,7 @@ GPUd() bool TrackParametrizationWithError<value_T>::update(const TrackParametriz
   }
 
   // updated covariance: Cov0 = Cov0 - K*Cov0
-  matK *= ROOT::Math::SMatrix<double, kNParams, kNParams, ROOT::Math::MatRepStd<double, kNParams>>(matC0);
+  matK *= o2::math_utils::SMatrix<double, kNParams, kNParams, o2::math_utils::MatRepStd<double, kNParams>>(matC0);
   mC[kSigY2] -= matK(kY, kY);
   mC[kSigZY] -= matK(kZ, kY);
   mC[kSigZ2] -= matK(kZ, kZ);
@@ -901,8 +900,6 @@ GPUd() bool TrackParametrizationWithError<value_T>::update(const TrackParametriz
   return update(rhs, covI);
 }
 
-#endif
-
 //______________________________________________
 template <typename value_T>
 GPUd() bool TrackParametrizationWithError<value_T>::update(const value_t* p, const value_t* cov)
@@ -1245,6 +1242,7 @@ GPUd() void TrackParametrizationWithError<value_T>::printHexadecimal()
 
 namespace o2::track
 {
+
 #if !defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_DEVICE) // FIXME: DR: WORKAROUND to avoid CUDA bug creating host symbols for device code.
 template class TrackParametrizationWithError<float>;
 #endif
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index b0f9e8ac55c6f..9b39416042f7b 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -26,12 +26,9 @@
 
 namespace o2
 {
-// namespace gpu
-// {
-// // class GPUChainITS;
-// }
 namespace its
-{namespace gpu
+{
+namespace gpu
 {
 
 class DefaultGPUAllocator : public ExternalAllocator
@@ -59,13 +56,17 @@ class TimeFrameGPU : public TimeFrame
   void loadClustersDevice();
   void loadTrackletsDevice();
   void loadCellsDevice();
+  void loadCellsLUT();
   void loadTrackSeedsDevice();
   void loadTrackSeedsChi2Device();
   void loadRoadsDevice();
   void loadTrackSeedsDevice(std::vector<CellSeed>&);
-  void createCellNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
+  void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
+  void createNeighboursLUTDevice(const int, const unsigned int);
   void createTrackITSExtDevice(std::vector<CellSeed>&);
   void downloadTrackITSExtDevice(std::vector<CellSeed>&);
+  void downloadCellsNeighbours(std::vector<std::vector<std::pair<int, int>>>&, const int);
+  void downloadNeighboursLUT(std::vector<int>&, const int);
   void initDeviceChunks(const int, const int);
   template <Task task>
   size_t loadChunkData(const size_t, const size_t, const size_t);
@@ -92,6 +93,7 @@ class TimeFrameGPU : public TimeFrame
   // Hybrid
   Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
   TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
+  int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
   gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
   TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
   // TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() { return mTrackingFrameInfoDeviceArray; }
@@ -99,10 +101,14 @@ class TimeFrameGPU : public TimeFrame
   Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
   Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
   Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
+  int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; }
+  int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
   CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; }
   CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
   o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
   float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
+  int* getNeighboursIndexTablesDevice(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
+
   void setDevicePropagator(const o2::base::PropagatorImpl<float>*) override;
 
   // Host-specific getters
@@ -131,9 +137,13 @@ class TimeFrameGPU : public TimeFrame
   Cluster** mUnsortedClustersDeviceArray;
   std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
   Tracklet** mTrackletsDeviceArray;
-  std::array<int*, nLayers - 2> mCellsLookupTablesDevice;
-  int** mCellsLookupTablesDeviceArray;
+  std::array<int*, nLayers - 2> mCellsLUTDevice;
+  std::array<int*, nLayers - 3> mNeighboursLUTDevice;
+  int** mCellsLUTDeviceArray;
+  int** mNeighboursCellDeviceArray;
+  int** mNeighboursCellLUTDeviceArray;
   std::array<CellSeed*, nLayers - 2> mCellsDevice;
+  std::array<int*, nLayers - 2> mNeighboursIndexTablesDevice;
   CellSeed* mTrackSeedsDevice;
   CellSeed** mCellsDeviceArray;
   std::array<o2::track::TrackParCovF*, nLayers - 2> mCellSeedsDevice;
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index cc74456bbb1aa..c8b8d3080eea4 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -49,6 +49,33 @@ GPUg() void fitTrackSeedsKernel(
   const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT);
 #endif
 } // namespace gpu
+void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                int* neighboursLUTs,
+                                int** cellsLUTs,
+                                gpuPair<int, int>* cellNeighbours,
+                                int* neighboursIndexTable,
+                                const float maxChi2ClusterAttachment,
+                                const float bz,
+                                const int layerIndex,
+                                const unsigned int nCells,
+                                const unsigned int nCellsNext,
+                                const int maxCellNeighbours,
+                                const int nBlocks,
+                                const int nThreads);
+
+void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                  int* neighboursLUTs,
+                                  int** cellsLUTs,
+                                  gpuPair<int, int>* cellNeighbours,
+                                  int* neighboursIndexTable,
+                                  const float maxChi2ClusterAttachment,
+                                  const float bz,
+                                  const int layerIndex,
+                                  const unsigned int nCells,
+                                  const unsigned int nCellsNext,
+                                  const int maxCellNeighbours,
+                                  const int nBlocks,
+                                  const int nThreads);
 
 void trackSeedHandler(CellSeed* trackSeeds,
                       const TrackingFrameInfo** foundTrackingFrameInfo,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
index 062009954a5e8..27630a4deeeef 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
@@ -13,7 +13,7 @@
 if(CUDA_ENABLED)
 find_package(CUDAToolkit)
 message(STATUS "Building ITS CUDA tracker")
-
+add_compile_options(-O0 -g -fPIC)
 o2_add_library(ITStrackingCUDA
                SOURCES ClusterLinesGPU.cu
                        Context.cu
@@ -32,7 +32,7 @@ o2_add_library(ITStrackingCUDA
                                      O2::SimulationDataFormat
                                      O2::ReconstructionDataFormats
                                      O2::GPUCommon
-                                     CUDA::nvToolsExt # TODO: change to CUDA::nvtx3 when CMake bump >= 3.25
+                                     CUDA::nvToolsExt
                PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider
                TARGETVARNAME targetName)
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 8f61ffaf1bea3..50d8be7d7f717 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -45,13 +45,10 @@ using utils::checkGPUError;
 
 void* DefaultGPUAllocator::allocate(size_t size)
 {
-  LOGP(info, "Called DefaultGPUAllocator::allocate with size {}", size);
+  LOGP(fatal, "Called DefaultGPUAllocator::allocate with size {}", size);
   return nullptr; // to be implemented
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////
-// TimeFrameGPU
-/////////////////////////////////////////////////////////////////////////////////////////
 template <int nLayers>
 TimeFrameGPU<nLayers>::TimeFrameGPU()
 {
@@ -79,105 +76,6 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl<f
   mPropagatorDevice = propagator;
 }
 
-template <int nLayers>
-void TimeFrameGPU<nLayers>::registerHostMemory(const int maxLayers)
-{
-  if (mHostRegistered) {
-    return;
-  } else {
-    mHostRegistered = true;
-  }
-  for (auto iLayer{0}; iLayer < maxLayers; ++iLayer) {
-    checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
-    checkGPUError(cudaHostRegister(mNClustersPerROF[iLayer].data(), mNClustersPerROF[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    checkGPUError(cudaHostRegister(mIndexTables[iLayer].data(), (mStaticTrackingParams.ZBins * mStaticTrackingParams.PhiBins + 1) * mNrof * sizeof(int), cudaHostRegisterPortable));
-  }
-  checkGPUError(cudaHostRegister(mHostNTracklets.data(), (nLayers - 1) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable));
-  checkGPUError(cudaHostRegister(mHostNCells.data(), (nLayers - 2) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
-{
-  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-    checkGPUError(cudaHostUnregister(mTrackingFrameInfo[iLayer].data()));
-  }
-  checkGPUError(cudaHostUnregister(mTrackingFrameInfoDevice.data()));
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::initialise(const int iteration,
-                                       const TrackingParameters& trkParam,
-                                       const int maxLayers,
-                                       IndexTableUtils* utils,
-                                       const TimeFrameGPUParameters* gpuParam)
-{
-  mGpuStreams.resize(mGpuParams.nTimeFrameChunks);
-  o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers);
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::wipe(const int maxLayers)
-{
-  unregisterHostMemory(maxLayers);
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::initDevice(IndexTableUtils* utils,
-                                       const TrackingParameters& trkParam,
-                                       const TimeFrameGPUParameters& gpuParam,
-                                       const int maxLayers,
-                                       const int iteration)
-{
-  // mStaticTrackingParams.ZBins = trkParam.ZBins;
-  // mStaticTrackingParams.PhiBins = trkParam.PhiBins;
-  // if (mFirstInit) {
-  //   mGpuParams = gpuParam;
-  //   allocMemAsync(reinterpret_cast<void**>(&mTrackingParamsDevice), sizeof(gpu::StaticTrackingParameters<nLayers>), nullptr, true);
-  //   checkGPUError(cudaMemcpy(mTrackingParamsDevice, &mStaticTrackingParams, sizeof(gpu::StaticTrackingParameters<nLayers>), cudaMemcpyHostToDevice));
-  //   if (utils) { // If utils is not nullptr, then its gpu vertexing
-  //     mIndexTableUtils = *utils;
-  //     allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, true);
-  //   } else { // GPU tracking otherwise
-  //     mIndexTableUtils.setTrackingParameters(trkParam);
-  //   }
-
-  // mMemChunks.resize(mGpuParams.nTimeFrameChunks, GpuTimeFrameChunk<nLayers>{static_cast<TimeFrame*>(this), mGpuParams});
-  // mVerticesInChunks.resize(mGpuParams.nTimeFrameChunks);
-  // mNVerticesInChunks.resize(mGpuParams.nTimeFrameChunks);
-  // mLabelsInChunks.resize(mGpuParams.nTimeFrameChunks);
-  // LOGP(info, "Size of fixed part is: {} MB", GpuTimeFrameChunk<nLayers>::computeFixedSizeBytes(mGpuParams) / MB);
-  // LOGP(info, "Size of scaling part is: {} MB", GpuTimeFrameChunk<nLayers>::computeScalingSizeBytes(GpuTimeFrameChunk<nLayers>::computeRofPerChunk(mGpuParams, mAvailMemGB), mGpuParams) / MB);
-  // LOGP(info, "Allocating {} chunks of {} rofs capacity each.", mGpuParams.nTimeFrameChunks, mGpuParams.nROFsPerChunk);
-
-  // for (int iChunk{0}; iChunk < mMemChunks.size(); ++iChunk) {
-  //   mMemChunks[iChunk].allocate(GpuTimeFrameChunk<nLayers>::computeRofPerChunk(mGpuParams, mGpuParams.maxGPUMemoryGB), mGpuStreams[iChunk]);
-  // }
-  //   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-  //     allocMemAsync(reinterpret_cast<void**>(&mROframesClustersDevice[iLayer]), mROframesClusters[iLayer].size() * sizeof(int), nullptr, true);
-  //     allocMemAsync(reinterpret_cast<void**>(&(mUsedClustersDevice[iLayer])), sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof, nullptr, true);
-  //   }
-  //   allocMemAsync(reinterpret_cast<void**>(&mVerticesDevice), sizeof(Vertex) * mGpuParams.maxVerticesCapacity, nullptr, true);
-  //   allocMemAsync(reinterpret_cast<void**>(&mROframesPVDevice), sizeof(int) * (mNrof + 1), nullptr, true);
-
-  //   mFirstInit = false;
-  // }
-  // if (maxLayers < nLayers) { // Vertexer
-  //   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-  //     checkGPUError(cudaMemcpy(mROframesClustersDevice[iLayer], mROframesClusters[iLayer].data(), mROframesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
-  //   }
-  // } else { // Tracker
-  //   checkGPUError(cudaMemcpy(mVerticesDevice, mPrimaryVertices.data(), sizeof(Vertex) * mPrimaryVertices.size(), cudaMemcpyHostToDevice));
-  //   checkGPUError(cudaMemcpy(mROframesPVDevice, mROframesPV.data(), sizeof(int) * mROframesPV.size(), cudaMemcpyHostToDevice));
-  //   if (!iteration) {
-  //     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-  //       checkGPUError(cudaMemset(mUsedClustersDevice[iLayer], 0, sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof));
-  //     }
-  //   }
-  // }
-  // checkGPUError(cudaMemcpy(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice));
-}
-
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice()
 {
@@ -246,6 +144,7 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(info, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", mCells[iLayer].size(), iLayer, mCells[iLayer].size() * sizeof(CellSeed) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), mCells[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding.
     // Register and move data
     checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable));
     checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
@@ -255,6 +154,21 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
   checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadCellsLUT()
+{
+  for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
+    LOGP(info, "gpu-transfer: loading {} cell LUTs on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&(mCellsLUTDevice[iLayer])), sizeof(int) * mCellsLookupTable[iLayer].size(), nullptr, getExtAllocator());
+    // Register and move data
+    checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
+    checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  }
+  allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator());
+  checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable));
+  checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadRoadsDevice()
 {
@@ -274,14 +188,21 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(std::vector<CellSeed>& seeds)
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createCellNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours)
+void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours)
 {
   mCellsNeighbours[layer].clear();
   mCellsNeighbours[layer].resize(neighbours.capacity());
   LOGP(info, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.capacity(), neighbours.capacity() * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.capacity() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], 0, neighbours.capacity() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
-  checkGPUError(cudaHostRegister(neighbours.data(), neighbours.capacity() * sizeof(std::pair<int, int>), cudaHostRegisterPortable));
+  checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.capacity() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
+{
+  LOGP(debug, "gpu-allocation: reserving {} slots for neighbours LUT, for {} MB.", nCells, nCells * sizeof(int) / MB);
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), nCells * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
+  checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, nCells * sizeof(int), mGpuStreams[0].get()));
 }
 
 template <int nLayers>
@@ -295,6 +216,23 @@ void TimeFrameGPU<nLayers>::createTrackITSExtDevice(std::vector<CellSeed>& seeds
   checkGPUError(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::downloadCellsNeighbours(std::vector<std::vector<std::pair<int, int>>>& neighbours, const int layer)
+{
+  LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
+  // TOOD: something less dangerous than assuming the same memory layout of std::pair and gpuPair :)
+  checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  // checkGPUError(cudaHostUnregister(neighbours[layer].data()));
+  // discardResult(cudaDeviceSynchronize());
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::downloadNeighboursLUT(std::vector<int>& lut, const int layer)
+{
+  LOGP(info, "gpu-transfer: downloading {} neighbours lut, for {} MB.", lut.size(), lut.size() * sizeof(int) / MB);
+  checkGPUError(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(std::vector<CellSeed>& seeds)
 {
@@ -302,7 +240,107 @@ void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(std::vector<CellSeed>& see
   checkGPUError(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   checkGPUError(cudaHostUnregister(mTrackITSExt.data()));
   checkGPUError(cudaHostUnregister(seeds.data()));
-  discardResult(cudaDeviceSynchronize());
+  // discardResult(cudaDeviceSynchronize());
+}
+
+/// Legacy
+template <int nLayers>
+void TimeFrameGPU<nLayers>::registerHostMemory(const int maxLayers)
+{
+  if (mHostRegistered) {
+    return;
+  } else {
+    mHostRegistered = true;
+  }
+  for (auto iLayer{0}; iLayer < maxLayers; ++iLayer) {
+    checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
+    checkGPUError(cudaHostRegister(mNClustersPerROF[iLayer].data(), mNClustersPerROF[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
+    checkGPUError(cudaHostRegister(mIndexTables[iLayer].data(), (mStaticTrackingParams.ZBins * mStaticTrackingParams.PhiBins + 1) * mNrof * sizeof(int), cudaHostRegisterPortable));
+  }
+  checkGPUError(cudaHostRegister(mHostNTracklets.data(), (nLayers - 1) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable));
+  checkGPUError(cudaHostRegister(mHostNCells.data(), (nLayers - 2) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable));
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
+{
+  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+    checkGPUError(cudaHostUnregister(mTrackingFrameInfo[iLayer].data()));
+  }
+  checkGPUError(cudaHostUnregister(mTrackingFrameInfoDevice.data()));
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::initialise(const int iteration,
+                                       const TrackingParameters& trkParam,
+                                       const int maxLayers,
+                                       IndexTableUtils* utils,
+                                       const TimeFrameGPUParameters* gpuParam)
+{
+  mGpuStreams.resize(mGpuParams.nTimeFrameChunks);
+  o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers);
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::wipe(const int maxLayers)
+{
+  unregisterHostMemory(maxLayers);
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::initDevice(IndexTableUtils* utils,
+                                       const TrackingParameters& trkParam,
+                                       const TimeFrameGPUParameters& gpuParam,
+                                       const int maxLayers,
+                                       const int iteration)
+{
+  // mStaticTrackingParams.ZBins = trkParam.ZBins;
+  // mStaticTrackingParams.PhiBins = trkParam.PhiBins;
+  // if (mFirstInit) {
+  //   mGpuParams = gpuParam;
+  //   allocMemAsync(reinterpret_cast<void**>(&mTrackingParamsDevice), sizeof(gpu::StaticTrackingParameters<nLayers>), nullptr, true);
+  //   checkGPUError(cudaMemcpy(mTrackingParamsDevice, &mStaticTrackingParams, sizeof(gpu::StaticTrackingParameters<nLayers>), cudaMemcpyHostToDevice));
+  //   if (utils) { // If utils is not nullptr, then its gpu vertexing
+  //     mIndexTableUtils = *utils;
+  //     allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, true);
+  //   } else { // GPU tracking otherwise
+  //     mIndexTableUtils.setTrackingParameters(trkParam);
+  //   }
+
+  // mMemChunks.resize(mGpuParams.nTimeFrameChunks, GpuTimeFrameChunk<nLayers>{static_cast<TimeFrame*>(this), mGpuParams});
+  // mVerticesInChunks.resize(mGpuParams.nTimeFrameChunks);
+  // mNVerticesInChunks.resize(mGpuParams.nTimeFrameChunks);
+  // mLabelsInChunks.resize(mGpuParams.nTimeFrameChunks);
+  // LOGP(info, "Size of fixed part is: {} MB", GpuTimeFrameChunk<nLayers>::computeFixedSizeBytes(mGpuParams) / MB);
+  // LOGP(info, "Size of scaling part is: {} MB", GpuTimeFrameChunk<nLayers>::computeScalingSizeBytes(GpuTimeFrameChunk<nLayers>::computeRofPerChunk(mGpuParams, mAvailMemGB), mGpuParams) / MB);
+  // LOGP(info, "Allocating {} chunks of {} rofs capacity each.", mGpuParams.nTimeFrameChunks, mGpuParams.nROFsPerChunk);
+
+  // for (int iChunk{0}; iChunk < mMemChunks.size(); ++iChunk) {
+  //   mMemChunks[iChunk].allocate(GpuTimeFrameChunk<nLayers>::computeRofPerChunk(mGpuParams, mGpuParams.maxGPUMemoryGB), mGpuStreams[iChunk]);
+  // }
+  //   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+  //     allocMemAsync(reinterpret_cast<void**>(&mROframesClustersDevice[iLayer]), mROframesClusters[iLayer].size() * sizeof(int), nullptr, true);
+  //     allocMemAsync(reinterpret_cast<void**>(&(mUsedClustersDevice[iLayer])), sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof, nullptr, true);
+  //   }
+  //   allocMemAsync(reinterpret_cast<void**>(&mVerticesDevice), sizeof(Vertex) * mGpuParams.maxVerticesCapacity, nullptr, true);
+  //   allocMemAsync(reinterpret_cast<void**>(&mROframesPVDevice), sizeof(int) * (mNrof + 1), nullptr, true);
+
+  //   mFirstInit = false;
+  // }
+  // if (maxLayers < nLayers) { // Vertexer
+  //   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+  //     checkGPUError(cudaMemcpy(mROframesClustersDevice[iLayer], mROframesClusters[iLayer].data(), mROframesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
+  //   }
+  // } else { // Tracker
+  //   checkGPUError(cudaMemcpy(mVerticesDevice, mPrimaryVertices.data(), sizeof(Vertex) * mPrimaryVertices.size(), cudaMemcpyHostToDevice));
+  //   checkGPUError(cudaMemcpy(mROframesPVDevice, mROframesPV.data(), sizeof(int) * mROframesPV.size(), cudaMemcpyHostToDevice));
+  //   if (!iteration) {
+  //     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+  //       checkGPUError(cudaMemset(mUsedClustersDevice[iLayer], 0, sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof));
+  //     }
+  //   }
+  // }
+  // checkGPUError(cudaMemcpy(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice));
 }
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index 984f39dece4f1..9268802265b08 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -324,11 +324,15 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
 {
-  // TrackerTraits::findCellsNeighbours(iteration);
+  auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
+  mTimeFrameGPU->loadCellsDevice();
+  mTimeFrameGPU->loadCellsLUT();
+  std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
   for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
     const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer + 1].size())};
     mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
     mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);
+
     if (mTimeFrameGPU->getCells()[iLayer + 1].empty() ||
         mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) {
       mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
@@ -336,35 +340,59 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
     }
 
     int layerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer].size())};
-    std::vector<std::pair<int, int>> cellsNeighbours;
-    cellsNeighbours.reserve(nextLayerCellsNum);
-    mTimeFrameGPU->loadCellsDevice();
-    mTimeFrameGPU->createCellNeighboursDevice(iLayer, cellsNeighbours);
-
-    // //   // // [...]
-    // cellNeighboursHandler<true>(mTimeFrameGPU->getDeviceArrayCells(),
-    //                             mTimeFrameGPU->getDeviceArrayNeighboursCellLUT(),
-    //                             mTimeFrameGPU->getDeviceNeighbours(iLayer), );
-    //   // //         // Compute Cell Neighbours LUT
-    //   // //         checkGPUError(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),                       // d_temp_storage
-    //   // //                                                     mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes
-    //   // //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),    // d_in
-    //   // //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),    // d_out
-    //   // //                                                     mTimeFrameGPU->getHostNCells(chunkId)[iLayer + 1],                              // num_items
-    //   // //                                                     mTimeFrameGPU->getStream(chunkId).get()));
-
-    //   // cellsNeighboursHandler<false>(mTimeFrameGPU->getDeviceNeighbours(iLayer));
-    //   // // [...]
-
-    //   std::sort(cellsNeighbours.begin(), cellsNeighbours.end(), [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
-    //     return a.second < b.second;
-    //   });
-    //   mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
-    //   mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighbours.size());
-    //   for (auto& cellNeighboursIndex : cellsNeighbours) {
-    //     mTimeFrameGPU->getCellsNeighbours()[iLayer].push_back(cellNeighboursIndex.first);
-    //   }
-    //   std::inclusive_scan(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].end(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin());
+    mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
+    countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
+                               mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
+                               mTimeFrameGPU->getDeviceArrayCellsLUT(),
+                               mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                               mTimeFrameGPU->getNeighboursIndexTablesDevice(iLayer),
+                               mTrkParams[0].MaxChi2ClusterAttachment,
+                               mBz,
+                               iLayer,
+                               layerCellsNum,
+                               nextLayerCellsNum,
+                               1e2,
+                               conf.nBlocks,
+                               conf.nThreads);
+    mTimeFrameGPU->downloadNeighboursLUT(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer);
+    // Get the number of found cells from LUT
+    cellsNeighboursLayer[iLayer].resize(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].back());
+    mTimeFrameGPU->createNeighboursDevice(iLayer, cellsNeighboursLayer[iLayer]);
+    computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
+                                 mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
+                                 mTimeFrameGPU->getDeviceArrayCellsLUT(),
+                                 mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                                 mTimeFrameGPU->getNeighboursIndexTablesDevice(iLayer),
+                                 mTrkParams[0].MaxChi2ClusterAttachment,
+                                 mBz,
+                                 iLayer,
+                                 layerCellsNum,
+                                 nextLayerCellsNum,
+                                 1e2,
+                                 conf.nBlocks,
+                                 conf.nThreads);
+    mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
+    mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());
+    mTimeFrameGPU->downloadCellsNeighbours(cellsNeighboursLayer, iLayer);
+    std::sort(cellsNeighboursLayer[iLayer].begin(), cellsNeighboursLayer[iLayer].end(), [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
+      return a.second < b.second;
+    });
+    for (auto& cellNeighboursIndex : cellsNeighboursLayer[iLayer]) {
+      if (cellNeighboursIndex.first != -1) {
+        mTimeFrameGPU->getCellsNeighbours()[iLayer].push_back(cellNeighboursIndex.first);
+      }
+    }
+
+    int c = 0;
+    std::cout << " ==== Layer " << iLayer << " ====" << std::endl;
+    for (auto& h : mTimeFrameGPU->getCellsNeighboursLUT()[iLayer]) {
+      std::cout << h << "\t";
+      if (!(++c % 150)) {
+        std::cout << std::endl;
+      }
+    }
+    std::cout << std::endl;
+    // std::inclusive_scan(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].end(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin());
   }
 };
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 9ccdbf510d09f..4993892bc6e68 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -262,20 +262,21 @@ GPUg() void processNeighboursKernel(const CellSeed* currentCellSeed,
 template <bool initRun, int nLayers = 7> // Version for new tracker to supersede the old one
 GPUg() void computeLayerCellNeighboursKernel(
   CellSeed** cellSeedArray,
-  int** neighboursLUT,
-  const int* cellsNextLayerLUT,
+  int* neighboursLUT,
+  int* neighboursIndexTable,
+  int** cellsLUTs,
   gpuPair<int, int>* cellNeighbours,
   const float maxChi2ClusterAttachment,
   const float bz,
   const int layerIndex,
-  const int* nCells,
+  const unsigned int nCells,
   const int maxCellNeighbours = 1e2)
 {
-  for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells[layerIndex]; iCurrentCellIndex += blockDim.x * gridDim.x) {
+  for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells; iCurrentCellIndex += blockDim.x * gridDim.x) {
     const auto& currentCellSeed{cellSeedArray[layerIndex][iCurrentCellIndex]};
     const int nextLayerTrackletIndex{currentCellSeed.getSecondTrackletIndex()};
-    const int nextLayerFirstCellIndex{cellsNextLayerLUT[nextLayerTrackletIndex]};
-    const int nextLayerLastCellIndex{cellsNextLayerLUT[nextLayerTrackletIndex + 1]};
+    const int nextLayerFirstCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex]};
+    const int nextLayerLastCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex + 1]};
     int foundNeighbours{0};
     for (int iNextCell{nextLayerFirstCellIndex}; iNextCell < nextLayerLastCellIndex; ++iNextCell) {
       CellSeed nextCellSeed{cellSeedArray[layerIndex + 1][iNextCell]};      // Copy
@@ -292,13 +293,11 @@ GPUg() void computeLayerCellNeighboursKernel(
         continue;
       }
       if constexpr (initRun) {
-        atomicAdd(neighboursLUT[layerIndex] + iNextCell, 1);
+        atomicAdd(neighboursLUT + iNextCell, 1);
+        foundNeighbours++;
+        neighboursIndexTable[iCurrentCellIndex]++;
       } else {
-        if (foundNeighbours >= maxCellNeighbours) {
-          printf("its-gpu-neighbours-finder: data loss on layer: %d: number of neightbours exceeded the threshold!\n");
-          continue;
-        }
-        cellNeighbours[neighboursLUT[layerIndex][iNextCell] + foundNeighbours++] = {iCurrentCellIndex, iNextCell};
+        cellNeighbours[neighboursIndexTable[iCurrentCellIndex] + foundNeighbours++] = {iCurrentCellIndex, iNextCell};
 
         // FIXME: this is prone to race conditions: check on level is not atomic
         const int currentCellLevel{currentCellSeed.getLevel()};
@@ -673,7 +672,7 @@ GPUg() void computeLayerCellsKernel(
   const int* trackletsCurrentLayerLUT,
   const int nTrackletsCurrent,
   CellSeed* cells,
-  int* cellsLUT,
+  int* cellsLUTs,
   const StaticTrackingParameters<nLayers>* trkPars)
 {
   for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) {
@@ -694,17 +693,17 @@ GPUg() void computeLayerCellsKernel(
 
       if (deltaTanLambda / trkPars->CellDeltaTanLambdaSigma < trkPars->NSigmaCut) {
         if constexpr (!initRun) {
-          new (cells + cellsLUT[iCurrentTrackletIndex] + foundCells) Cell{currentTracklet.firstClusterIndex, nextTracklet.firstClusterIndex,
-                                                                          nextTracklet.secondClusterIndex,
-                                                                          iCurrentTrackletIndex,
-                                                                          iNextTrackletIndex};
+          new (cells + cellsLUTs[iCurrentTrackletIndex] + foundCells) Cell{currentTracklet.firstClusterIndex, nextTracklet.firstClusterIndex,
+                                                                           nextTracklet.secondClusterIndex,
+                                                                           iCurrentTrackletIndex,
+                                                                           iNextTrackletIndex};
         }
         ++foundCells;
       }
     }
     if constexpr (initRun) {
       // Fill cell Lookup table
-      cellsLUT[iCurrentTrackletIndex] = foundCells;
+      cellsLUTs[iCurrentTrackletIndex] = foundCells;
     }
   }
 }
@@ -759,29 +758,94 @@ GPUg() void computeLayerRoadsKernel(
 }
 } // namespace gpu
 
-template <bool isInit>
-void cellNeighboursHandler(CellSeed** cellsLayersDevice,
-                           int** neighboursLUTs,
-                           const int* cellsNextLayerLUT,
-                           gpuPair<int, int>* cellNeighbours,
-                           const float maxChi2ClusterAttachment,
-                           const float bz,
-                           const int layerIndex,
-                           const int* nCells,
-                           const int maxCellNeighbours,
-                           const int nBlocks,
-                           const int nThreads)
+void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                int* neighboursLUT,
+                                int** cellsLUTs,
+                                gpuPair<int, int>* cellNeighbours,
+                                int* neighboursIndexTable,
+                                const float maxChi2ClusterAttachment,
+                                const float bz,
+                                const int layerIndex,
+                                const unsigned int nCells,
+                                const unsigned int nCellsNext,
+                                const int maxCellNeighbours,
+                                const int nBlocks,
+                                const int nThreads)
+{
+  gpu::computeLayerCellNeighboursKernel<true><<<nBlocks, nThreads>>>(
+    cellsLayersDevice,
+    neighboursLUT,
+    neighboursIndexTable,
+    cellsLUTs,
+    cellNeighbours,
+
+    maxChi2ClusterAttachment,
+    bz,
+    layerIndex,
+    nCells,
+    maxCellNeighbours);
+  void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr;
+  size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0;
+  // gpu::printBufferLayerOnThread<<<1, 1>>>(layerIndex, neighboursLUT, nCellsNext + 1);
+  gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
+                                              temp_storage_bytes, // temp_storage_bytes
+                                              neighboursLUT,      // d_in
+                                              neighboursLUT,      // d_out
+                                              nCellsNext + 1));   // num_items
+
+  discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+  gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage,     // d_temp_storage
+                                              temp_storage_bytes, // temp_storage_bytes
+                                              neighboursLUT,      // d_in
+                                              neighboursLUT,      // d_out
+                                              nCellsNext + 1));   // num_items
+
+  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
+                                              temp_storage_bytes_2, // temp_storage_bytes
+                                              neighboursIndexTable, // d_in
+                                              neighboursIndexTable, // d_out
+                                              nCells,               // num_items
+                                              0));
+  discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2));
+  gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,     // d_temp_storage
+                                              temp_storage_bytes_2, // temp_storage_bytes
+                                              neighboursIndexTable, // d_in
+                                              neighboursIndexTable, // d_out
+                                              nCells,               // num_items
+                                              0));
+  gpu::printBufferLayerOnThread<<<1, 1>>>(layerIndex, neighboursLUT, nCellsNext + 1);
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+}
+
+void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                  int* neighboursLUT,
+                                  int** cellsLUTs,
+                                  gpuPair<int, int>* cellNeighbours,
+                                  int* neighboursIndexTable,
+                                  const float maxChi2ClusterAttachment,
+                                  const float bz,
+                                  const int layerIndex,
+                                  const unsigned int nCells,
+                                  const unsigned int nCellsNext,
+                                  const int maxCellNeighbours,
+                                  const int nBlocks,
+                                  const int nThreads)
 {
-  gpu::computeLayerCellNeighboursKernel<isInit><<<nBlocks, nThreads>>>(
+  gpu::computeLayerCellNeighboursKernel<false><<<nBlocks, nThreads>>>(
     cellsLayersDevice,
-    neighboursLUTs,
-    cellsNextLayerLUT,
+    neighboursLUT,
+    neighboursIndexTable,
+    cellsLUTs,
     cellNeighbours,
     maxChi2ClusterAttachment,
     bz,
     layerIndex,
     nCells,
     maxCellNeighbours);
+  gpuCheckError(cudaPeekAtLastError());
+  gpuCheckError(cudaDeviceSynchronize());
+  // Eventually, here will go the sorting.
 }
 
 void trackSeedHandler(CellSeed* trackSeeds,
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
index 08e50cf9ea824..906eb0fa5c21e 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -288,6 +288,7 @@ class TimeFrame
   std::vector<Road<5>> mRoads;
   std::vector<std::vector<TrackITSExt>> mTracks;
   std::vector<std::vector<int>> mCellsNeighbours;
+  std::vector<std::vector<int>> mCellsLookupTable;
 
   const o2::base::PropagatorImpl<float>* mPropagatorDevice = nullptr; // Needed only for GPU
  protected:
@@ -315,7 +316,6 @@ class TimeFrame
   std::vector<std::array<float, 2>> mPValphaX; /// PV x and alpha for track propagation
   std::vector<std::vector<MCCompLabel>> mTrackletLabels;
   std::vector<std::vector<MCCompLabel>> mCellLabels;
-  std::vector<std::vector<int>> mCellsLookupTable;
   std::vector<std::vector<int>> mCellsNeighboursLUT;
   std::vector<std::vector<MCCompLabel>> mTracksLabel;
   std::vector<int> mBogusClusters; /// keep track of clusters with wild coordinates
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
index 4457d4515e0a6..4fe59bcfc4729 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
@@ -450,6 +450,7 @@ void TrackerTraits::findCellsNeighbours(const int iteration)
         }
 
         mTimeFrame->getCellsNeighboursLUT()[iLayer][iNextCell]++;
+        printf("%d -> %d\n", iCell, iNextCell);
         cellsNeighbours.push_back(std::make_pair(iCell, iNextCell));
         const int currentCellLevel{currentCellSeed.getLevel()};
 
@@ -467,6 +468,15 @@ void TrackerTraits::findCellsNeighbours(const int iteration)
       mTimeFrame->getCellsNeighbours()[iLayer].push_back(cellNeighboursIndex.first);
     }
     std::inclusive_scan(mTimeFrame->getCellsNeighboursLUT()[iLayer].begin(), mTimeFrame->getCellsNeighboursLUT()[iLayer].end(), mTimeFrame->getCellsNeighboursLUT()[iLayer].begin());
+    // int c = 0;
+    // std::cout << " ==== Layer " << iLayer << " ====" << std::endl;
+    // for (auto& h : mTimeFrame->getCellsNeighboursLUT()[iLayer]) {
+    //   std::cout << h << "\t";
+    //   if (!(++c % 150)) {
+    //     std::cout << std::endl;
+    //   }
+    // }
+    // std::cout << std::endl;
   }
 }