diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index 20632a2c50371..57be54acbbbef 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -183,7 +183,8 @@ int filterCellNeighboursHandler(std::vector<int>&,
 void processNeighboursHandler(const int iteration,
                               const int startLayer,
                               const int startLevel,
-                              CellSeed** currentCellSeeds,
+                              CellSeed** allCellSeeds,
+                              CellSeed* currentCellSeeds,
                               const unsigned int nCurrentCells,
                               // const int* currentCellIds,
                               // const unsigned int nCurrentCellsIds,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index d46e81421879f..3e89605e592be 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -91,13 +91,12 @@ template <int nLayers>
 void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex)
 {
   auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
-  // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
   mTimeFrameGPU->createTrackletsLUTDevice(iteration);
 
   const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
   gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
   int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
-  int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
+  int endROF{o2::gpu::CAMath::Min(mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())};
 
   countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
                                        mTimeFrameGPU->getDeviceMultCutMask(),
@@ -284,6 +283,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
                                startLayer,
                                startLevel,
                                mTimeFrameGPU->getDeviceArrayCells(),
+                               mTimeFrameGPU->getDeviceCells()[startLayer],
                                mTimeFrameGPU->getNCells()[startLayer],
                                mTimeFrameGPU->getDeviceArrayUsedClusters(),
                                mTimeFrameGPU->getDeviceNeighbours(startLayer - 1),
@@ -384,8 +384,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       }
       mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
     }
+    mTimeFrameGPU->loadUsedClustersDevice();
   }
-  mTimeFrameGPU->loadUsedClustersDevice();
   if (iteration == mTrkParams.size() - 1) {
     mTimeFrameGPU->unregisterHostMemory(0);
   }
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 500eff04826cb..32a433cc2ffce 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -192,9 +192,6 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
                              0.f, 0.f, 0.f, 0.f, sg2q2pt});
 }
 
-// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); };
-// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; };
-
 struct sort_tracklets {
   GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }
 };
@@ -584,14 +581,13 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
   }
 }
 
-template <bool dryRun, int nLayers = 7>
-GPUg() void processNeighboursKernel(const int iteration,
-                                    const int layer,
+template <bool dryRun, bool debug = false, int nLayers = 7>
+GPUg() void processNeighboursKernel(const int layer,
                                     const int level,
-                                    CellSeed** currentCellSeeds,
-                                    const unsigned int nCurrentCells,
+                                    CellSeed** allCellSeeds,
+                                    CellSeed* currentCellSeeds,
                                     const int* currentCellIds,
-                                    // const unsigned int nCurrentCellsIds,
+                                    const unsigned int nCurrentCells,
                                     CellSeed* updatedCellSeeds,
                                     int* updatedCellsIds,
                                     int* foundSeedsTable,               // auxiliary only in GPU code to compute the number of cells per iteration
@@ -609,7 +605,12 @@ GPUg() void processNeighboursKernel(const int iteration,
   constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment.
   for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) {
     int foundSeeds{0};
-    const auto& currentCell{currentCellSeeds[layer][iCurrentCell]};
+    const auto& currentCell{currentCellSeeds[iCurrentCell]};
+    if constexpr (debug) {
+      if (threadIdx.x == 0 && currentCellIds != nullptr) {
+        currentCellSeeds[iCurrentCell].printCell();
+      }
+    }
     if (currentCell.getLevel() != level) {
       continue;
     }
@@ -619,12 +620,17 @@ GPUg() void processNeighboursKernel(const int iteration,
       continue;
     }
     const int cellId = currentCellIds == nullptr ? iCurrentCell : currentCellIds[iCurrentCell];
+    // if constexpr (debug) {
+    //   if (threadIdx.x == 0 && currentCellIds != nullptr) {
+    //     printf("-> ic: %d cellId: %d \n", iCurrentCell, cellId);
+    //   }
+    // }
     const int startNeighbourId{cellId ? neighboursLUT[cellId - 1] : 0};
     const int endNeighbourId{neighboursLUT[cellId]};
 
     for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) {
       const int neighbourCellId = neighbours[iNeighbourCell];
-      const CellSeed& neighbourCell = currentCellSeeds[layer - 1][neighbourCellId];
+      const CellSeed& neighbourCell = allCellSeeds[layer - 1][neighbourCellId];
       if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) {
         continue;
       }
@@ -684,6 +690,7 @@ GPUd() void pPointer(T* ptr)
 {
   printf("[%p]\t", ptr);
 }
+
 template <typename... Args>
 GPUg() void printPointersKernel(std::tuple<Args...> args)
 {
@@ -791,6 +798,15 @@ GPUg() void printTrackletsLUTPerROF(const int layerId,
     }
   }
 }
+
+GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = 0)
+{
+  for (unsigned int iCell{0}; iCell < nCells; ++iCell) {
+    if (threadIdx.x == tId) {
+      seed[iCell].printCell();
+    }
+  }
+}
 } // namespace gpu
 
 template <int nLayers>
@@ -1163,7 +1179,8 @@ int filterCellNeighboursHandler(std::vector<int>& neighHost, // TODO: eventually
 void processNeighboursHandler(const int iteration,
                               const int startLayer,
                               const int startLevel,
-                              CellSeed** currentCellSeeds,
+                              CellSeed** allCellSeeds,
+                              CellSeed* currentCellSeeds,
                               const unsigned int nCurrentCells,
                               // const int* currentCellIds,
                               // const unsigned int nCurrentCellsIds,
@@ -1181,12 +1198,12 @@ void processNeighboursHandler(const int iteration,
                               const int nThreads)
 {
   thrust::device_vector<int> foundSeedsTable(nCurrentCells + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this.
-  gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(iteration,
-                                                            startLayer,
+  gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(startLayer,
                                                             startLevel,
+                                                            allCellSeeds,
                                                             currentCellSeeds,
-                                                            nCurrentCells,
                                                             nullptr,                                       // currentCellIds,
+                                                            nCurrentCells,
                                                             nullptr,                                       // updatedCellSeeds,
                                                             nullptr,                                       // updatedCellsIds,
                                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
@@ -1217,30 +1234,88 @@ void processNeighboursHandler(const int iteration,
   thrust::device_vector<int> updatedCellIds(foundSeedsTable.back()), lastCellIds(foundSeedsTable.back());
   thrust::device_vector<CellSeed> updatedCellSeeds(foundSeedsTable.back()), lastCellSeeds(foundSeedsTable.back());
 
-  gpu::processNeighboursKernel<false><<<1, 1>>>(iteration,
-                                                startLayer,
-                                                startLevel,
-                                                currentCellSeeds,
-                                                nCurrentCells,
-                                                nullptr,                                       // currentCellIds,
-                                                thrust::raw_pointer_cast(&updatedCellSeeds[0]), //  updatedCellSeeds
-                                                thrust::raw_pointer_cast(&updatedCellIds[0]),  // updatedCellsIds
-                                                thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
-                                                usedClusters,                                  // Used clusters
-                                                neighbours,
-                                                neighboursLUT,
-                                                foundTrackingFrameInfo,
-                                                bz,
-                                                MaxChi2ClusterAttachment,
-                                                propagator,
-                                                matCorrType);
+  gpu::processNeighboursKernel<false><<<nBlocks, nThreads>>>(startLayer,
+                                                             startLevel,
+                                                             allCellSeeds,
+                                                             currentCellSeeds,
+                                                             nullptr,                                        // currentCellIds,
+                                                             nCurrentCells,
+                                                             thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds
+                                                             thrust::raw_pointer_cast(&updatedCellIds[0]),   // updatedCellsIds
+                                                             thrust::raw_pointer_cast(&foundSeedsTable[0]),  // auxiliary only in GPU code to compute the number of cells per iteration
+                                                             usedClusters,                                   // Used clusters
+                                                             neighbours,
+                                                             neighboursLUT,
+                                                             foundTrackingFrameInfo,
+                                                             bz,
+                                                             MaxChi2ClusterAttachment,
+                                                             propagator,
+                                                             matCorrType);
+  LOGP(info, "1: updatedCellIds {} - updatedCellSeeds {}", updatedCellIds.size(), updatedCellSeeds.size());
+  // gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&updatedCellSeeds[0]), foundSeedsTable.back());
+  // gpu::printBufferLayerOnThread<<<1, 1>>>(0, thrust::raw_pointer_cast(&updatedCellIds[0]), updatedCellIds.size());
   int level = startLevel;
   for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
-    lastCellSeeds.swap(updatedCellSeeds);
-    lastCellIds.swap(updatedCellIds);
-    // std::vector<CellSeed>().swap(updatedCellSeeds); /// tame the memory peaks
-    // updatedCellId.clear();
-    // processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId);
+    LOGP(info, "iLayer: {} level {}", iLayer, level);
+    --level;
+    // lastCellSeeds.swap(updatedCellSeeds);
+    // lastCellIds.swap(updatedCellIds);
+    // // gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&lastCellSeeds[0]), lastCellSeeds.size());
+    // foundSeedsTable.resize(lastCellSeeds.size() + 1);
+    // thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
+    // gpu::processNeighboursKernel<true, true><<<1, 1>>>(iLayer,
+    //                                                    level,
+    //                                                    allCellSeeds,
+    //                                                    thrust::raw_pointer_cast(&lastCellSeeds[0]),
+    //                                                    thrust::raw_pointer_cast(&lastCellIds[0]),     // currentCellIds,
+    //                                                    lastCellSeeds.size(),
+    //                                                    nullptr,                                       // updatedCellSeeds,
+    //                                                    nullptr,                                       // updatedCellsIds,
+    //                                                    thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
+    //                                                    usedClusters,                                  // Used clusters
+    //                                                    neighbours,
+    //                                                    neighboursLUT,
+    //                                                    foundTrackingFrameInfo,
+    //                                                    bz,
+    //                                                    MaxChi2ClusterAttachment,
+    //                                                    propagator,
+    //                                                    matCorrType);
+    // gpu::printBufferLayerOnThread<<<1, 1>>>(0, thrust::raw_pointer_cast(&foundSeedsTable[0]), foundSeedsTable.size());
+    // gpuCheckError(cudaPeekAtLastError());
+    // gpuCheckError(cudaDeviceSynchronize());
+    // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,                              // d_temp_storage
+    //                                             temp_storage_bytes,                            // temp_storage_bytes
+    //                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+    //                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+    //                                             lastCellSeeds.size() + 1,                      // num_items
+    //                                             0));
+    // discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+    // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2,                              // d_temp_storage
+    //                                             temp_storage_bytes_2,                          // temp_storage_bytes
+    //                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
+    //                                             thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
+    //                                             lastCellSeeds.size() + 1,                      // num_items
+    //                                             0));
+    // updatedCellIds.resize(foundSeedsTable.back(), 0);
+    // updatedCellSeeds.resize(foundSeedsTable.back(), CellSeed());
+    // gpu::processNeighboursKernel<false><<<nBlocks, nThreads>>>(iLayer,
+    //                                                            level,
+    //                                                            allCellSeeds,
+    //                                                            thrust::raw_pointer_cast(&lastCellSeeds[0]),
+    //                                                            thrust::raw_pointer_cast(&lastCellIds[0]),      // currentCellIds,
+    //                                                            lastCellSeeds.size(),
+    //                                                            thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds
+    //                                                            thrust::raw_pointer_cast(&updatedCellIds[0]),   // updatedCellsIds
+    //                                                            thrust::raw_pointer_cast(&foundSeedsTable[0]),  // auxiliary only in GPU code to compute the number of cells per iteration
+    //                                                            usedClusters,                                   // Used clusters
+    //                                                            neighbours,
+    //                                                            neighboursLUT,
+    //                                                            foundTrackingFrameInfo,
+    //                                                            bz,
+    //                                                            MaxChi2ClusterAttachment,
+    //                                                            propagator,
+    //                                                            matCorrType);
+    // LOGP(info, " -> 2: updatedCellIds {} - updatedCellSeeds {}", updatedCellIds.size(), updatedCellSeeds.size());
   }
 
   gpuCheckError(cudaFree(d_temp_storage));
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h
index cb9f28665cf07..257e57dbd3fda 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h
@@ -15,7 +15,7 @@
 
 #ifndef TRACKINGITSU_INCLUDE_CACELL_H_
 #define TRACKINGITSU_INCLUDE_CACELL_H_
-
+#include <iostream>
 #ifndef GPUCA_GPUCODE_DEVICE
 #include <array>
 #include <vector>
@@ -104,6 +104,7 @@ class CellSeed final : public o2::track::TrackParCovF
   GPUhd() int* getLevelPtr() { return &mLevel; }
   GPUhd() int* getClusters() { return mClusters; }
   GPUhd() int getCluster(int i) const { return mClusters[i]; }
+  GPUhdi() void printCell();
 
  private:
   int mClusters[7] = {-1, -1, -1, -1, -1, -1, -1};
@@ -112,6 +113,11 @@ class CellSeed final : public o2::track::TrackParCovF
   float mChi2 = 0.f;
 };
 
+GPUhdi() void CellSeed::printCell()
+{
+  printf("trkl: %d, %d\t lvl: %d\t chi2: %f\n", mTracklets[0], mTracklets[1], mLevel, mChi2);
+}
+
 } // namespace its
 } // namespace o2
 #endif /* TRACKINGITSU_INCLUDE_CACELL_H_ */
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
index 35498711a6677..0461f9e3c43e9 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
@@ -486,7 +486,7 @@ void TrackerTraits::processNeighbours(int iLayer,
   int failed[5]{0, 0, 0, 0, 0}, attempts{0}, failedByMismatch{0};
 #endif
 
-#pragma omp parallel for num_threads(mNThreads)
+  // #pragma omp parallel for num_threads(mNThreads)
   for (unsigned int iCell = 0; iCell < currentCellSeed.size(); ++iCell) {
     const CellSeed& currentCell{currentCellSeed[iCell]};
     if (currentCell.getLevel() != iLevel) {
@@ -498,6 +498,9 @@ void TrackerTraits::processNeighbours(int iLayer,
       continue; /// this we do only on the first iteration, hence the check on currentCellId
     }
     const int cellId = currentCellId.empty() ? iCell : currentCellId[iCell];
+    // if (!currentCellId.empty()) {
+    //   printf("-> ic: %d cellId: %d \n", iCell, cellId);
+    // }
     const int startNeighbourId{cellId ? mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId - 1] : 0};
     const int endNeighbourId{mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId]};
 
@@ -583,14 +586,26 @@ void TrackerTraits::findRoads(const int iteration)
       std::vector<CellSeed> lastCellSeed, updatedCellSeed;
 
       processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
-
+      LOGP(info, "1: updatedCellIds {} - updatedCellSeeds {}", updatedCellId.size(), updatedCellSeed.size());
+      // for (auto& c : updatedCellSeed) {
+      //   c.printCell();
+      // }
+      // for (int i{0}; i < updatedCellId.size(); ++i) {
+      //   if (!(i % 150)) {
+      //     printf("\n layer %d: ===> %d/%d\t", 0, i, (int)updatedCellId.size());
+      //   }
+      //   printf("%d\t", updatedCellId[i]);
+      // }
+      // printf("\n");
       int level = startLevel;
       for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
+        LOGP(info, "iLayer: {} level {}", iLayer, level);
         lastCellSeed.swap(updatedCellSeed);
         lastCellId.swap(updatedCellId);
         std::vector<CellSeed>().swap(updatedCellSeed); /// tame the memory peaks
         updatedCellId.clear();
         processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId);
+        // LOGP(info, " -> 2: updatedCellIds {} - updatedCellSeeds {}", updatedCellId.size(), updatedCellSeed.size());
       }
       for (auto& seed : updatedCellSeed) {
         if (seed.getQ2Pt() > 1.e3 || seed.getChi2() > mTrkParams[0].MaxChi2NDF * ((startLevel + 2) * 2 - 5)) {