From 51e7404c89aeb0b459324baac7c21122e10b432f Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Wed, 20 Nov 2024 18:20:31 +0100
Subject: [PATCH] Checkpointing

---
 .../GPU/ITStrackingGPU/TimeFrameGPU.h         |  13 +-
 .../GPU/ITStrackingGPU/TrackingKernels.h      |  12 +-
 .../ITS/tracking/GPU/ITStrackingGPU/Utils.h   |  16 +-
 .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu     |  64 +++-
 .../tracking/GPU/cuda/TrackerTraitsGPU.cxx    |  16 +-
 .../ITS/tracking/GPU/cuda/TrackingKernels.cu  | 295 ++++++++++--------
 .../tracking/include/ITStracking/TimeFrame.h  |   5 +-
 7 files changed, 269 insertions(+), 152 deletions(-)
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index 6b5d32dc1c17a..8f2ba9dd6ea58 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -51,10 +51,14 @@ class TimeFrameGPU : public TimeFrame
   void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
   void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int);
   void initDeviceSAFitting();
+  void loadIndexTableUtils(const int);
   void loadTrackingFrameInfoDevice(const int);
   void loadUnsortedClustersDevice(const int);
   void loadClustersDevice(const int);
-  void loadROframeClustersDevice(const int iteration);
+  void loadClustersIndexTables(const int iteration);
+  void createUsedClustersDevice(const int);
+  void loadUsedClustersDevice();
+  void loadROframeClustersDevice(const int);
   void loadMultiplicityCutMask(const int);
   void loadVertices(const int);
 
@@ -112,6 +116,8 @@ class TimeFrameGPU : public TimeFrame
   const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
   const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
   const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
+  const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; }
+  const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; }
   const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; }
   const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
   const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
@@ -148,7 +154,6 @@ class TimeFrameGPU : public TimeFrame
   // Device pointers
   StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
   IndexTableUtils* mIndexTableUtilsDevice;
-  std::array<unsigned char*, nLayers> mUsedClustersDevice;
 
   // Hybrid pref
   uint8_t* mMultMaskDevice;
@@ -156,9 +161,13 @@ class TimeFrameGPU : public TimeFrame
   int* mROFramesPVDevice;
   std::array<Cluster*, nLayers> mClustersDevice;
   std::array<Cluster*, nLayers> mUnsortedClustersDevice;
+  std::array<int*, nLayers> mClustersIndexTablesDevice;
+  std::array<unsigned char*, nLayers> mUsedClustersDevice;
   std::array<int*, nLayers> mROFramesClustersDevice;
   const Cluster** mClustersDeviceArray;
   const Cluster** mUnsortedClustersDeviceArray;
+  const int** mClustersIndexTablesDeviceArray;
+  const unsigned char** mUsedClustersDeviceArray;
   const int** mROFrameClustersDeviceArray;
   std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
   const Tracklet** mTrackletsDeviceArray;
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index 5b70d571b9b52..0496635f8898b 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -51,7 +51,8 @@ GPUg() void fitTrackSeedsKernel(
 } // namespace gpu
 
 template <int nLayers = 7>
-void computeTrackletsInRofsHandler(const uint8_t* multMask,
+void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
+                                   const uint8_t* multMask,
                                    const int startROF,
                                    const int endROF,
                                    const int maxROF,
@@ -62,6 +63,15 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask,
                                    const int nVertices,
                                    const Cluster** clusters,
                                    const int** ROFClusters,
+                                   const unsigned char** usedClusters,
+                                   const int** clustersIndexTables,
+                                   const int iteration,
+                                   const float NSigmaCut,
+                                   std::vector<float>& phiCuts,
+                                   const float resolutionPV,
+                                   std::vector<float>& minR,
+                                   std::vector<float>& maxR,
+                                   std::vector<float>& resolutions,
                                    std::vector<float>& radii,
                                    std::vector<float>& mulScatAng,
                                    const int nBlocks,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
index cc45e24a8cbdb..a88e51742e84a 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -39,9 +39,9 @@ struct gpuSpan {
   using ref = T&;
 
   GPUd() gpuSpan() : _data(nullptr), _size(0) {}
-  GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
-  GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
-  GPUd() std::size_t size() const { return _size; }
+  GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
+  GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
+  GPUd() unsigned int size() const { return _size; }
   GPUd() bool empty() const { return _size == 0; }
   GPUd() ref front() const { return _data[0]; }
   GPUd() ref back() const { return _data[_size - 1]; }
@@ -50,7 +50,7 @@ struct gpuSpan {
 
  protected:
   ptr _data;
-  std::size_t _size;
+  unsigned int _size;
 };
 
 template <typename T>
@@ -60,10 +60,10 @@ struct gpuSpan<const T> {
   using ref = const T&;
 
   GPUd() gpuSpan() : _data(nullptr), _size(0) {}
-  GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
+  GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
   GPUd() gpuSpan(const gpuSpan<T>& other) : _data(other._data), _size(other._size) {}
-  GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
-  GPUd() std::size_t size() const { return _size; }
+  GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
+  GPUd() unsigned int size() const { return _size; }
   GPUd() bool empty() const { return _size == 0; }
   GPUd() ref front() const { return _data[0]; }
   GPUd() ref back() const { return _data[_size - 1]; }
@@ -72,7 +72,7 @@ struct gpuSpan<const T> {
 
  protected:
   ptr _data;
-  std::size_t _size;
+  unsigned int _size;
 };
 
 enum class Task {
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 0db970b2361ab..0b30d7af99246 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -92,6 +92,19 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl<f
   mPropagatorDevice = propagator;
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils");
+  if (!iteration) {
+    LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator());
+  }
+  LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
+  checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
 {
@@ -128,13 +141,56 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
   }
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(info, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
+      checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    }
+    allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator());
+      checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get()));
+    }
+    allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
+  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+    LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB);
+    checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  }
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
 {
   if (!iteration) {
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
+      LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
       checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
@@ -167,7 +223,7 @@ void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
 {
   if (!iteration) {
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
-    LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
+    LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator());
     checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
@@ -179,10 +235,10 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
 {
   if (!iteration) {
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
-    LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
+    LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator());
     checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
+    LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator());
     checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index 8fb3f628cacc0..b83caf5b5b849 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -31,10 +31,13 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
   mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers);
   mTimeFrameGPU->loadClustersDevice(iteration);
   mTimeFrameGPU->loadUnsortedClustersDevice(iteration);
+  mTimeFrameGPU->loadClustersIndexTables(iteration);
   mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration);
   mTimeFrameGPU->loadMultiplicityCutMask(iteration);
   mTimeFrameGPU->loadVertices(iteration);
   mTimeFrameGPU->loadROframeClustersDevice(iteration);
+  mTimeFrameGPU->createUsedClustersDevice(iteration);
+  mTimeFrameGPU->loadIndexTableUtils(iteration);
 }
 
 template <int nLayers>
@@ -95,7 +98,8 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
   int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
   int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
 
-  computeTrackletsInRofsHandler<nLayers>(mTimeFrameGPU->getDeviceMultCutMask(),
+  computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
+                                         mTimeFrameGPU->getDeviceMultCutMask(),
                                          startROF,
                                          endROF,
                                          mTimeFrameGPU->getNrof(),
@@ -106,6 +110,15 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
                                          mTimeFrameGPU->getPrimaryVerticesNum(),
                                          mTimeFrameGPU->getDeviceArrayClusters(),
                                          mTimeFrameGPU->getDeviceROframeClusters(),
+                                         mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                         mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
+                                         iteration,
+                                         mTrkParams[iteration].NSigmaCut,
+                                         mTimeFrameGPU->getPhiCuts(),
+                                         mTrkParams[iteration].PVres,
+                                         mTimeFrameGPU->getMinRs(),
+                                         mTimeFrameGPU->getMaxRs(),
+                                         mTimeFrameGPU->getPositionResolutions(),
                                          mTrkParams[iteration].LayerRadii,
                                          mTimeFrameGPU->getMSangles(),
                                          conf.nBlocks,
@@ -324,6 +337,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
     }
   }
+  mTimeFrameGPU->loadUsedClustersDevice();
   if (iteration == mTrkParams.size() - 1) {
     mTimeFrameGPU->unregisterHostMemory(0);
   }
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index cacfa70ec2993..110595e2821a9 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -74,9 +74,36 @@ namespace o2::its
 {
 using namespace constants::its2;
 using Vertex = o2::dataformats::Vertex<o2::dataformats::TimeStamp<int>>;
+
+GPUd() float Sq(float v)
+{
+  return v * v;
+}
+
 namespace gpu
 {
 
+GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
+                              const o2::its::IndexTableUtils& utils,
+                              const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
+{
+  const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz;
+  const float phiRangeMin = (maxdeltaphi > constants::math::Pi) ? 0.f : currentCluster.phi - maxdeltaphi;
+  const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz;
+  const float phiRangeMax = (maxdeltaphi > constants::math::Pi) ? constants::math::TwoPi : currentCluster.phi + maxdeltaphi;
+
+  if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] ||
+      zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) {
+
+    return getEmptyBinsRect();
+  }
+
+  return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)),
+              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)),
+              o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)),
+              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))};
+}
+
 GPUd() bool fitTrack(TrackITSExt& track,
                      int start,
                      int end,
@@ -199,30 +226,28 @@ struct is_valid_pair {
 
 GPUd() gpuSpan<const Vertex> getPrimaryVertices(const int rof,
                                                 const int* roframesPV,
-                                                const int nRof,
+                                                const int nROF,
                                                 const uint8_t* mask,
                                                 const Vertex* vertices)
 {
   const int start_pv_id = roframesPV[rof];
-  const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1;
-  size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if Rof is excluded
+  const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1;
+  size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded
   return gpuSpan<const Vertex>(&vertices[start_pv_id], delta);
 };
 
 GPUd() gpuSpan<const Cluster> getClustersOnLayer(const int rof,
-                                                 const int** roframesClus,
+                                                 const int totROFs,
                                                  const int layer,
-                                                 const int nRof,
+                                                 const int** roframesClus,
                                                  const Cluster** clusters)
 {
-  const int start_clus_id{roframesClus[layer][rof]};
-  const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1;
-  const int delta = roframesClus[layer][stop_rof] - start_clus_id;
-  printf("\t\t\t r: %d nr: %d rfci: %d d: %d\n", rof, nRof, roframesClus[layer][rof], delta);
-  if (rof < 0 || rof >= nRof) {
+  if (rof < 0 || rof >= totROFs) {
     return gpuSpan<const Cluster>();
   }
-
+  const int start_clus_id{roframesClus[layer][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id;
   return gpuSpan<const Cluster>(&(clusters[layer][start_clus_id]), delta);
 }
 
@@ -424,45 +449,45 @@ GPUg() void computeLayerCellsKernel(
   }
 }
 
-template <int nLayers = 7>
+template <bool initRun = true, int nLayers = 7>
 GPUg() void computeLayerTrackletsMultiROFKernel(
+  const IndexTableUtils* utils,
   const uint8_t* multMask,
   const int layerIndex,
   const int startROF,
   const int endROF,
-  const int maxRof,
-  const int deltaRof,
+  const int totalROFs,
+  const int deltaROF,
   const Vertex* vertices,
   const int* rofPV,
   const int nVertices,
   const int vertexId,
-  const Cluster** clusters, // input data rof0
-  const int** ROFClusters,  // Number of clusters on layers per ROF
-  // const int* roFrameClustersNextLayer,        // Number of clusters on layer 1 per ROF
-  // const int* indexTablesNext,                 // input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
-  // const unsigned char* usedClustersLayer,     // input data rof0
-  // const unsigned char* usedClustersNextLayer, // input data rof1
+  const Cluster** clusters,           // input data rof0
+  const int** ROFClusters,            // Number of clusters on layers per ROF
+  const unsigned char** usedClusters, // Used clusters
+  const int** indexTables,            // input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
   // Tracklet* tracklets,                        // output data
-  // const float phiCut,
-  // const float minR,
-  // const float maxR,
+  // int* trackletsLUT,
+  const int iteration,
+  const float NSigmaCut,
+  const float phiCut,
+  const float resolutionPV,
+  const float maxR,
+  const float minR,
+  const float positionResolution,
   const float meanDeltaR = -666.f,
-  // const float positionResolution,
-  const float mSAngle = -666.f)
+  const float MSAngle = -666.f)
 {
-  // const int phiBins{utils->getNphiBins()};
-  // const int zBins{utils->getNzBins()};
-  for (unsigned int iRof{blockIdx.x}; iRof < endROF - startROF; iRof += gridDim.x) {
-    auto rof0 = iRof + startROF;
-    auto primaryVertices = getPrimaryVertices(rof0, rofPV, maxRof, multMask, vertices);
+  const int phiBins{utils->getNphiBins()};
+  const int zBins{utils->getNzBins()};
+  for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) {
+    const int rof0 = iROF + startROF;
+    auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices);
     const auto startVtx{vertexId >= 0 ? vertexId : 0};
     const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast<int>(primaryVertices.size())) : static_cast<int>(primaryVertices.size())};
-    auto minRof = o2::gpu::CAMath::Max(startROF, static_cast<int>(rof0 - deltaRof));
-    auto maxRof = o2::gpu::CAMath::Min(endROF - 1, static_cast<int>(rof0 + deltaRof));
-    auto clustersCurrentLayer = getClustersOnLayer(iRof, ROFClusters, layerIndex, maxRof, clusters);
-    if (threadIdx.x == 0) {
-      // printf("> l: %d r: %d rc: %d s: %d e: %d \n", layerIndex, iRof, ROFClusters[layerIndex][iRof], clustersCurrentLayer.size(), clustersCurrentLayer.empty());
-    }
+    auto minROF = o2::gpu::CAMath::Max(startROF, static_cast<int>(rof0 - deltaROF));
+    auto maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast<int>(rof0 + deltaROF));
+    auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters);
     if (clustersCurrentLayer.empty()) {
       continue;
     }
@@ -470,79 +495,72 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
     for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) {
       unsigned int storedTracklets{0};
       auto currentCluster{clustersCurrentLayer[currentClusterIndex]};
-      if (threadIdx.x == 0) {
-        printf("rof: %d has %zu clusters on layer %d\n", rof0, clustersCurrentLayer.size(), layerIndex);
+      const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex};
+      if (usedClusters[layerIndex][currentSortedIndex]) {
+        continue;
       }
 
-      // const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex};
-      // const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startROF]};
-      // if (usedClustersLayer[currentSortedIndex]) {
-      // continue;
-      // }
-      //
-      // int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0;
-      // int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1}
-      // const float inverseR0{1.f / currentCluster.radius};
-      //
-      // for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) {
-      // const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]};
-      // const float resolution{Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))};
-      // const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0};
-      // const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate};
-      // const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate};
-      // const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution
-      // const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))};
-      //
-      // const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)};
-      //
-      // if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) {
-      // continue;
-      // }
-      // int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1};
-      // if (phiBinsNum < 0) {
-      // phiBinsNum += trkPars->PhiBins;
-      // }
-      // const int tableSize{phiBins * zBins + 1};
-      // for (int rof1{minRof}; rof1 <= maxRof; ++rof1) {
-      // auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]};
-      // if (!nClustersNext) { // number of clusters on next layer > 0
-      // continue;
-      // }
-      // for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) {
-      // int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins;
-      // const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
-      // const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1};
-      // const int firstRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + firstBinIndex];
-      // const int maxRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + maxBinIndex];
-      // for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) {
-      // if (iNextCluster >= nClustersNext) {
-      // break;
-      // }
-      // auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startROF] + iNextCluster};
-      // const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]};
-      // if (usedClustersNextLayer[nextCluster.clusterId]) {
-      // continue;
-      // }
-      // const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)};
-      // const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)};
-      //
-      // if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) {
-      // const float phi{ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)};
-      // const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)};
-      // const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster};
-      // if (storedTracklets < maxTrackletsPerCluster) {
-      // new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast<short>(rof0), static_cast<short>(rof1)};
-      // }
-      // else {
-      // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets);
-      // }
-      // ++storedTracklets;
-      // }
-      // }
-      // }
-      // }
-      // }
-      // */
+      const float inverseR0{1.f / currentCluster.radius};
+      for (int iV{startVtx}; iV < endVtx; ++iV) {
+        auto& primaryVertex{primaryVertices[iV]};
+        if (primaryVertex.isFlagSet(2) && iteration != 3) {
+          continue;
+        }
+        const float resolution = o2::gpu::CAMath::Sqrt(Sq(resolutionPV) / primaryVertex.getNContributors() + Sq(positionResolution));
+        const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0};
+        const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate};
+        const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate};
+        const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution
+        const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * MSAngle))};
+        const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)};
+        if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) {
+          continue;
+        }
+        int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1};
+
+        if (phiBinsNum < 0) {
+          phiBinsNum += phiBins;
+        }
+
+        const int tableSize{phiBins * zBins + 1};
+        for (int rof1{minROF}; rof1 <= maxROF; ++rof1) {
+          auto clustersNextLayer = getClustersOnLayer(rof1, totalROFs, layerIndex + 1, ROFClusters, clusters);
+          if (clustersNextLayer.empty()) {
+            continue;
+          }
+          for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) {
+            int iPhiBin = (selectedBinsRect.y + iPhiCount) % PhiBins;
+            const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
+            const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1};
+            const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex];
+            const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex];
+            for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) {
+              if (iNextCluster >= clustersNextLayer.size()) {
+                break;
+              }
+              const Cluster& nextCluster{clustersNextLayer[iNextCluster]};
+              if (usedClusters[layerIndex + 1][nextCluster.clusterId]) {
+                continue;
+              }
+              const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)};
+              const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) +
+                                                      currentCluster.zCoordinate - nextCluster.zCoordinate)};
+              if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) {
+                // if (layerIndex > 0) {
+                if constexpr (initRun) {
+                  // trackletsLUT[currentSortedIndex]++; // we need l0 as well for usual exclusive sums.
+                } else {
+                  // }
+                  const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)};
+                  const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)};
+                  // tf->getTracklets()[layerIndex].emplace_back(currentSortedIndex, tf->getSortedIndex(rof1, layerIndex + 1, iNextCluster), tanL, phi, rof0, rof1);
+                }
+                ++storedTracklets;
+              }
+            }
+          }
+        }
+      }
     }
   }
 }
@@ -550,26 +568,6 @@ GPUg() void computeLayerTrackletsMultiROFKernel(
 /////////////////////////////////////////
 // Debug Kernels
 /////////////////////////////////////////
-GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
-                              const o2::its::IndexTableUtils& utils,
-                              const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
-{
-  const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz;
-  const float phiRangeMin = currentCluster.phi - maxdeltaphi;
-  const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz;
-  const float phiRangeMax = currentCluster.phi + maxdeltaphi;
-
-  if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] ||
-      zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) {
-
-    return getEmptyBinsRect();
-  }
-
-  return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)),
-              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)),
-              o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)),
-              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))};
-}
 
 template <typename T>
 GPUd() void pPointer(T* ptr)
@@ -697,7 +695,8 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel(
 } // namespace gpu
 
 template <int nLayers>
-void computeTrackletsInRofsHandler(const uint8_t* multMask,
+void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
+                                   const uint8_t* multMask,
                                    const int startROF,
                                    const int endROF,
                                    const int maxROF,
@@ -708,16 +707,23 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask,
                                    const int nVertices,
                                    const Cluster** clusters,
                                    const int** ROFClusters,
+                                   const unsigned char** usedClusters,
+                                   const int** clustersIndexTables,
+                                   const int iteration,
+                                   const float NSigmaCut,
+                                   std::vector<float>& phiCuts,
+                                   const float resolutionPV,
+                                   std::vector<float>& minRs,
+                                   std::vector<float>& maxRs,
+                                   std::vector<float>& resolutions,
                                    std::vector<float>& radii,
                                    std::vector<float>& mulScatAng,
                                    const int nBlocks,
                                    const int nThreads)
 {
   for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) {
-    const auto meanDeltaR = radii[iLayer + 1] - radii[iLayer];
-    const auto mSAngle = mulScatAng[iLayer];
-    // gpu::printMatrixRow<<<1, 1>>>(iLayer, ROFClusters, maxROF);
-    gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>(
+    gpu::computeLayerTrackletsMultiROFKernel<<<nBlocks, nThreads>>>(
+      utils,
       multMask,
       iLayer,
       startROF,
@@ -730,8 +736,17 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask,
       vertexId,
       clusters,
       ROFClusters,
-      meanDeltaR,
-      mSAngle);
+      usedClusters,
+      clustersIndexTables,
+      iteration,
+      NSigmaCut,
+      phiCuts[iLayer],
+      resolutionPV,
+      minRs[iLayer + 1],
+      maxRs[iLayer + 1],
+      resolutions[iLayer],
+      radii[iLayer + 1] - radii[iLayer],
+      mulScatAng[iLayer]);
   }
 }
 
@@ -965,7 +980,8 @@ void trackSeedHandler(CellSeed* trackSeeds,
   gpuCheckError(cudaDeviceSynchronize());
 }
 
-template void computeTrackletsInRofsHandler<7>(const uint8_t* multMask,
+template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils,
+                                               const uint8_t* multMask,
                                                const int startROF,
                                                const int endROF,
                                                const int maxROF,
@@ -976,6 +992,15 @@ template void computeTrackletsInRofsHandler<7>(const uint8_t* multMask,
                                                const int nVertices,
                                                const Cluster** clusters,
                                                const int** ROFClusters,
+                                               const unsigned char** usedClusters,
+                                               const int** clustersIndexTables,
+                                               const int iteration,
+                                               const float NSigmaCut,
+                                               std::vector<float>& phiCuts,
+                                               const float resolutionPV,
+                                               std::vector<float>& minRs,
+                                               std::vector<float>& maxRs,
+                                               std::vector<float>& resolutions,
                                                std::vector<float>& radii,
                                                std::vector<float>& mulScatAng,
                                                const int nBlocks,
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
index 309ca2031a9b5..fa4f33782d16a 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -106,13 +106,16 @@ class TimeFrame
 
   float getBeamX() const;
   float getBeamY() const;
-
+  std::vector<float>& getMinRs() { return mMinR; }
+  std::vector<float>& getMaxRs() { return mMaxR; }
   float getMinR(int layer) const { return mMinR[layer]; }
   float getMaxR(int layer) const { return mMaxR[layer]; }
   float getMSangle(int layer) const { return mMSangles[layer]; }
   std::vector<float>& getMSangles() { return mMSangles; }
   float getPhiCut(int layer) const { return mPhiCuts[layer]; }
+  std::vector<float>& getPhiCuts() { return mPhiCuts; }
   float getPositionResolution(int layer) const { return mPositionResolution[layer]; }
+  std::vector<float>& getPositionResolutions() { return mPositionResolution; }
 
   gsl::span<Cluster> getClustersOnLayer(int rofId, int layerId);
   gsl::span<const Cluster> getClustersOnLayer(int rofId, int layerId) const;