CP

mconcas · Dec 5, 2024 · f8824df · f8824df
1 parent afaf9fc
commit f8824df
Show file tree

Hide file tree

Showing 7 changed files with 306 additions and 85 deletions.
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame
   void createCellsBuffers(const int);
   void createCellsDevice();
   void createCellsLUTDevice();
-  void createNeighboursDevice();
+  void createNeighboursIndexTablesDevice();
   void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
   void createNeighboursLUTDevice(const int, const unsigned int);
+  void createNeighboursDeviceArray();
   void createTrackITSExtDevice(std::vector<CellSeed>&);
   void downloadTrackITSExtDevice(std::vector<CellSeed>&);
   void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
@@ -113,7 +114,9 @@ class TimeFrameGPU : public TimeFrame
   Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
   TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
   int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
-  gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
+  gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
+  int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
+  int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
   TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
   const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
   const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
@@ -195,7 +198,9 @@ class TimeFrameGPU : public TimeFrame
 
   Road<nLayers - 2>* mRoadsDevice;
   TrackITSExt* mTrackITSExtDevice;
-  std::array<gpuPair<int, int>*, nLayers - 2> mNeighboursDevice;
+  std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
+  std::array<int*, nLayers - 2> mNeighboursDevice;
+  int** mNeighboursDeviceArray;
   std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
   const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;
 

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -175,9 +175,30 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
                                   const int nBlocks,
                                   const int nThreads);
 
-void filterCellNeighboursHandler(std::vector<int>&,
-                                 gpuPair<int, int>*,
-                                 unsigned int);
+int filterCellNeighboursHandler(std::vector<int>&,
+                                gpuPair<int, int>*,
+                                int*,
+                                unsigned int);
+
+void processNeighboursHandler(const int iteration,
+                              const int startLayer,
+                              const int startLevel,
+                              CellSeed** currentCellSeeds,
+                              const unsigned int nCurrentCells,
+                              // const int* currentCellIds,
+                              // const unsigned int nCurrentCellsIds,
+                              // CellSeed* updatedCellSeeds,
+                              // int* updatedCellsIds,
+                              const unsigned char** usedClusters, // Used clusters
+                              int* neighbours,
+                              int* neighboursLUT,
+                              const TrackingFrameInfo** foundTrackingFrameInfo,
+                              const float bz,
+                              const float MaxChi2ClusterAttachment,
+                              const o2::base::Propagator* propagator,
+                              const o2::base::PropagatorF::MatCorrType matCorrType,
+                              const int nBlocks,
+                              const int nThreads);
 
 void trackSeedHandler(CellSeed* trackSeeds,
                       const TrackingFrameInfo** foundTrackingFrameInfo,

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
@@ -13,7 +13,7 @@
 if(CUDA_ENABLED)
 find_package(CUDAToolkit)
 message(STATUS "Building ITS CUDA tracker")
-# add_compile_options(-O0 -g -lineinfo -fPIC)
+add_compile_options(-O0 -g -lineinfo -fPIC)
 # add_compile_definitions(ITS_MEASURE_GPU_TIME)
 o2_add_library(ITStrackingCUDA
                SOURCES ClusterLinesGPU.cu

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createNeighboursDevice()
+void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
+  // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
+  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
+  checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
+  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
     checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
   }
-  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
+  LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
+  checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
   mCellsNeighbours[layer].clear();
   mCellsNeighbours[layer].resize(neighbours.size());
+  LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
+  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
+  checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
-  checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
+void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
-  LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
-  checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
+  checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
   // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
-  checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+  checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
 }
 
 template <int nLayers>

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -214,7 +214,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
 {
-  mTimeFrameGPU->createNeighboursDevice();
+  mTimeFrameGPU->createNeighboursIndexTablesDevice();
   auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
   std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
   for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
@@ -228,17 +228,16 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
       continue;
     }
 
-    int layerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer].size())};
     mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
     countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
                                mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
                                mTimeFrameGPU->getDeviceArrayCellsLUT(),
-                               mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                               mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
                                mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
                                mTrkParams[0].MaxChi2ClusterAttachment,
                                mBz,
                                iLayer,
-                               layerCellsNum,
+                               mTimeFrameGPU->getNCells()[iLayer],
                                nextLayerCellsNum,
                                1e2,
                                conf.nBlocks,
@@ -250,12 +249,12 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
     computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
                                  mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
                                  mTimeFrameGPU->getDeviceArrayCellsLUT(),
-                                 mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                                 mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
                                  mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
                                  mTrkParams[0].MaxChi2ClusterAttachment,
                                  mBz,
                                  iLayer,
-                                 layerCellsNum,
+                                 mTimeFrameGPU->getNCells()[iLayer],
                                  nextLayerCellsNum,
                                  1e2,
                                  conf.nBlocks,
@@ -264,20 +263,38 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
     mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());
 
     filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
+                                mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
                                 mTimeFrameGPU->getDeviceNeighbours(iLayer),
                                 cellsNeighboursLayer[iLayer].size());
   }
+  mTimeFrameGPU->createNeighboursDeviceArray();
   mTimeFrameGPU->downloadCellsDevice();
   mTimeFrameGPU->unregisterRest();
 };
 
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
 {
+  auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
   for (int startLevel{mTrkParams[iteration].CellsPerRoad()}; startLevel >= mTrkParams[iteration].CellMinimumLevel(); --startLevel) {
     const int minimumLayer{startLevel - 1};
     std::vector<CellSeed> trackSeeds;
     for (int startLayer{mTrkParams[iteration].CellsPerRoad() - 1}; startLayer >= minimumLayer; --startLayer) {
+      processNeighboursHandler(iteration,
+                               startLayer,
+                               startLevel,
+                               mTimeFrameGPU->getDeviceArrayCells(),
+                               mTimeFrameGPU->getNCells()[startLayer],
+                               mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                               mTimeFrameGPU->getDeviceNeighbours(startLayer - 1),
+                               mTimeFrameGPU->getDeviceNeighboursLUT(startLayer - 1),
+                               mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
+                               mBz,
+                               mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
+                               mTimeFrameGPU->getDevicePropagator(),
+                               mCorrType,
+                               conf.nBlocks,
+                               conf.nThreads);
       std::vector<int> lastCellId, updatedCellId;
       std::vector<CellSeed> lastCellSeed, updatedCellSeed;
 
@@ -304,15 +321,15 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
     }
     mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
     mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
-    auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
-    trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),             // CellSeed* trackSeeds,
-                     mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo,
-                     mTimeFrameGPU->getDeviceTrackITSExt(),            // o2::its::TrackITSExt* tracks,
-                     trackSeeds.size(),                                // const size_t nSeeds,
-                     mBz,                                              // const float Bz,
+
+    trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),             // CellSeed* trackSeeds
+                     mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo
+                     mTimeFrameGPU->getDeviceTrackITSExt(),            // o2::its::TrackITSExt* tracks
+                     trackSeeds.size(),                                // const size_t nSeeds
+                     mBz,                                              // const float Bz
                      startLevel,                                       // const int startLevel,
-                     mTrkParams[0].MaxChi2ClusterAttachment,           // float maxChi2ClusterAttachment,
-                     mTrkParams[0].MaxChi2NDF,                         // float maxChi2NDF,
+                     mTrkParams[0].MaxChi2ClusterAttachment,           // float maxChi2ClusterAttachment
+                     mTrkParams[0].MaxChi2NDF,                         // float maxChi2NDF
                      mTimeFrameGPU->getDevicePropagator(),             // const o2::base::Propagator* propagator
                      mCorrType,                                        // o2::base::PropagatorImpl<float>::MatCorrType
                      conf.nBlocks,