From 4880ea5c764f3fa66063a131467f89ecb2956273 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 31 Oct 2024 16:53:56 +0100 Subject: [PATCH] Move cellFinder on a pure GPU-based implementation --- .../TrackParametrizationWithError.h | 4 +- Detectors/ITSMFT/ITS/tracking/CMakeLists.txt | 1 - .../GPU/ITStrackingGPU/TimeFrameGPU.h | 43 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 35 ++ .../ITS/tracking/GPU/cuda/CMakeLists.txt | 3 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 314 +++++++------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 89 +++- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 410 ++++++++++++------ .../ITS/tracking/include/ITStracking/Cell.h | 1 + .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 3 + 10 files changed, 588 insertions(+), 315 deletions(-) diff --git a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h index 015b5d37e258c..dd155e7f55569 100644 --- a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h +++ b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h @@ -42,7 +42,7 @@ class TrackParametrizationWithError : public TrackParametrization using MatrixDSym5 = o2::math_utils::SMatrix>; using MatrixD5 = o2::math_utils::SMatrix>; - GPUd() TrackParametrizationWithError(); + GPUhd() TrackParametrizationWithError(); GPUd() TrackParametrizationWithError(value_t x, value_t alpha, const params_t& par, const covMat_t& cov, int charge = 1, const PID pid = PID::Pion); GPUd() TrackParametrizationWithError(const dim3_t& xyz, const dim3_t& pxpypz, const gpu::gpustd::array& cv, int sign, bool sectorAlpha = true, const PID pid = PID::Pion); @@ -145,7 +145,7 @@ class TrackParametrizationWithError : public TrackParametrization //__________________________________________________________________________ template -GPUdi() TrackParametrizationWithError::TrackParametrizationWithError() : TrackParametrization{} +GPUhdi() TrackParametrizationWithError::TrackParametrizationWithError() : TrackParametrization{} { } diff --git a/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt index d3667294d6c61..f8c71e27d0058 100644 --- a/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/CMakeLists.txt @@ -37,7 +37,6 @@ o2_add_library(ITStracking O2::ITSMFTReconstruction O2::DataFormatsITS) - if (OpenMP_CXX_FOUND) target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index db1bfd836e8e6..ad8724f315ec8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -52,22 +52,28 @@ class TimeFrameGPU : public TimeFrame void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); void loadTrackingFrameInfoDevice(const int); - void loadUnsortedClustersDevice(); - void loadClustersDevice(); + void loadUnsortedClustersDevice(const int); + void loadClustersDevice(const int); void loadTrackletsDevice(); + void loadTrackletsLUTDevice(); void loadCellsDevice(); - void loadCellsLUT(); + void loadCellsLUTDevice(); void loadTrackSeedsDevice(); void loadTrackSeedsChi2Device(); void loadRoadsDevice(); void loadTrackSeedsDevice(std::vector&); + void createCellsBuffers(const int); + void createCellsDevice(); + void createCellsLUTDevice(); + void createNeighboursDevice(); void createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours); void createNeighboursLUTDevice(const int, const unsigned int); void createTrackITSExtDevice(std::vector&); void downloadTrackITSExtDevice(std::vector&); - void downloadCellsNeighbours(std::vector>>&, const int); - void downloadNeighboursLUT(std::vector&, const int); - void downloadCellsDevice(const int); + void downloadCellsNeighboursDevice(std::vector>>&, const int); + void downloadNeighboursLUTDevice(std::vector&, const int); + void downloadCellsDevice(); + void downloadCellsLUTDevice(); void unregisterRest(); void initDeviceChunks(const int, const int); template @@ -98,11 +104,11 @@ class TimeFrameGPU : public TimeFrame int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; } gpuPair* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } TrackingFrameInfo* getDeviceTrackingFrameInfo(const int); - // TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() { return mTrackingFrameInfoDeviceArray; } const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } - Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } - Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } - Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } + const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } + const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } + const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } + const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; } @@ -117,6 +123,11 @@ class TimeFrameGPU : public TimeFrame gsl::span getHostNTracklets(const int chunkId); gsl::span getHostNCells(const int chunkId); + // Host-available device getters + gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } + gsl::span getDeviceCells() { return mCellsDevice; } + gsl::span getNCellsDevice() { return mNCells; } + private: void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations bool mHostRegistered = false; @@ -124,6 +135,9 @@ class TimeFrameGPU : public TimeFrame TimeFrameGPUParameters mGpuParams; StaticTrackingParameters mStaticTrackingParams; + // Host-available device buffer sizes + std::array mNCells; + // Device pointers StaticTrackingParameters* mTrackingParamsDevice; IndexTableUtils* mIndexTableUtilsDevice; @@ -135,12 +149,15 @@ class TimeFrameGPU : public TimeFrame // Hybrid pref std::array mClustersDevice; std::array mUnsortedClustersDevice; - Cluster** mClustersDeviceArray; - Cluster** mUnsortedClustersDeviceArray; + const Cluster** mClustersDeviceArray; + const Cluster** mUnsortedClustersDeviceArray; std::array mTrackletsDevice; - Tracklet** mTrackletsDeviceArray; + const Tracklet** mTrackletsDeviceArray; + const int** mTrackletsLUTDeviceArray; + std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; + int** mCellsLUTDeviceArray; int** mNeighboursCellDeviceArray; int** mNeighboursCellLUTDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 167baa905f790..34e6165b9530f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -49,6 +49,41 @@ GPUg() void fitTrackSeedsKernel( const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT); #endif } // namespace gpu + +void countCellsHandler(const Cluster** sortedClusters, + const Cluster** unsortedClusters, + const TrackingFrameInfo** tfInfo, + const Tracklet** tracklets, + const int** trackletsLUT, + const int nTracklets, + const int layer, + CellSeed* cells, + int** cellsLUTsDeviceArray, + int* cellsLUTsHost, + const float bz, + const float maxChi2ClusterAttachment, + const float cellDeltaTanLambdaSigma, + const float nSigmaCut, + const int nBlocks, + const int nThreads); + +void computeCellsHandler(const Cluster** sortedClusters, + const Cluster** unsortedClusters, + const TrackingFrameInfo** tfInfo, + const Tracklet** tracklets, + const int** trackletsLUT, + const int nTracklets, + const int layer, + CellSeed* cells, + int** cellsLUTsDeviceArray, + int* cellsLUTsHost, + const float bz, + const float maxChi2ClusterAttachment, + const float cellDeltaTanLambdaSigma, + const float nSigmaCut, + const int nBlocks, + const int nThreads); + void countCellNeighboursHandler(CellSeed** cellsLayersDevice, int* neighboursLUTs, int** cellsLUTs, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index c8e1d0a910e5b..e2fc1f1388ad0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -13,7 +13,8 @@ if(CUDA_ENABLED) find_package(CUDAToolkit) message(STATUS "Building ITS CUDA tracker") -# add_compile_options(-O0 -g -lineinfo -fPIC) +add_compile_options(-O0 -g -lineinfo -fPIC) +# add_compile_definitions(ITS_MEASURE_GPU_TIME) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu Context.cu diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index c9c6792b5417b..67144ba2c98ea 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -21,15 +21,31 @@ #include #include +#include #include "GPUCommonDef.h" #include "GPUCommonMath.h" #include "GPUCommonLogger.h" -#ifndef __HIPCC__ -#define THRUST_NAMESPACE thrust::cuda +#ifdef ITS_MEASURE_GPU_TIME +#define START_GPU_STREAM_TIMER(stream, name) \ + cudaEvent_t event_start, event_stop; \ + checkGPUError(cudaEventCreate(&event_start)); \ + checkGPUError(cudaEventCreate(&event_stop)); \ + checkGPUError(cudaEventRecord(event_start, stream)); \ + const std::string task_name = name; + +#define STOP_GPU_STREAM_TIMER(stream) \ + checkGPUError(cudaEventRecord(event_stop, stream)); \ + checkGPUError(cudaEventSynchronize(event_stop)); \ + float ms; \ + checkGPUError(cudaEventElapsedTime(&ms, event_start, event_stop)); \ + std::cout << "Elapsed time for " << task_name << ": " << ms << " ms" << std::endl; \ + checkGPUError(cudaEventDestroy(event_start)); \ + checkGPUError(cudaEventDestroy(event_stop)); #else -#define THRUST_NAMESPACE thrust::hip +#define START_GPU_STREAM_TIMER(stream, name) +#define STOP_GPU_STREAM_TIMER(stream) #endif namespace o2 @@ -65,7 +81,7 @@ void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream* strPt if (extAllocator) { *ptr = mAllocator->allocate(size); } else { - LOGP(info, "Calling default CUDA allocator"); + LOGP(debug, "Calling default CUDA allocator"); checkGPUError(cudaMallocAsync(reinterpret_cast(ptr), size, strPtr->get())); } } @@ -77,43 +93,49 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl -void TimeFrameGPU::loadUnsortedClustersDevice() +void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) { - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", mUnsortedClusters[iLayer].size(), iLayer, mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB); - allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); - // Register and move data - checkGPUError(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading unsorted clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", mUnsortedClusters[iLayer].size(), iLayer, mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], mUnsortedClusters[iLayer].data(), mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } - allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template -void TimeFrameGPU::loadClustersDevice() +void TimeFrameGPU::loadClustersDevice(const int iteration) { - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", mClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(Cluster) / MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), mClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); - // Register and move data - checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", mClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(Cluster) / MB); + allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), mClusters[iLayer].size() * sizeof(Cluster), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mClustersDevice[iLayer], mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } - allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading trackingframeinfo"); if (!iteration) { for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", mTrackingFrameInfo[iLayer].size(), iLayer, mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB); allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, getExtAllocator()); - // Register and move data checkGPUError(cudaHostRegister(mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], mTrackingFrameInfo[iLayer].data(), mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } @@ -121,53 +143,108 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) checkGPUError(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadTrackletsDevice() { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB); allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mTracklets[iLayer].size() * sizeof(Tracklet), nullptr, getExtAllocator()); - // Register and move data checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::loadTrackletsLUTDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); + for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { + LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), mTrackletsLookupTable[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); + } + allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice)); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::createNeighboursDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds"); + for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { + LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadCellsDevice() { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", mCells[iLayer].size(), iLayer, mCells[iLayer].size() * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator()); allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mCells[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding. checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); - // Register and move data - checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template -void TimeFrameGPU::loadCellsLUT() +void TimeFrameGPU::createCellsLUTDevice() { - for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - LOGP(debug, "gpu-transfer: loading {} cell LUTs on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&(mCellsLUTDevice[iLayer])), sizeof(int) * mCellsLookupTable[iLayer].size(), nullptr, getExtAllocator()); - // Register and move data - checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); + for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { + LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::createCellsBuffers(const int layer) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); + mNCells[layer] = 0; + checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); + allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator()); + + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::loadCellsLUTDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs"); + for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { + LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB); + checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template @@ -182,111 +259,130 @@ void TimeFrameGPU::loadRoadsDevice() template void TimeFrameGPU::loadTrackSeedsDevice(std::vector& seeds) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading track seeds"); LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), &(mGpuStreams[0]), getExtAllocator()); checkGPUError(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); mCellsNeighbours[layer].clear(); mCellsNeighbours[layer].resize(neighbours.size()); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) { - LOGP(debug, "gpu-allocation: reserving {} slots for neighbours LUT, for {} MB.", nCells + 1, (nCells + 1) * sizeof(int) / MB); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); + LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createTrackITSExtDevice(std::vector& seeds) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving tracks"); mTrackITSExt.clear(); mTrackITSExt.resize(seeds.size()); LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB); allocMemAsync(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), &(mGpuStreams[0]), getExtAllocator()); checkGPUError(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); checkGPUError(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template -void TimeFrameGPU::downloadCellsDevice(const int layer) +void TimeFrameGPU::downloadCellsDevice() { - LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mCells[layer].size(), layer, mCells[layer].size() * sizeof(CellSeed) / MB); - checkGPUError(cudaMemcpyAsync(mCells[layer].data(), mCellsDevice[layer], mCells[layer].size() * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); - checkGPUError(cudaHostUnregister(mCells[layer].data())); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cells"); + for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { + LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); + mCells[iLayer].resize(mNCells[iLayer]); + checkGPUError(cudaMemcpyAsync(mCells[iLayer].data(), mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::downloadCellsLUTDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts"); + for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { + LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1)); + mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1); + checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template -void TimeFrameGPU::downloadCellsNeighbours(std::vector>>& neighbours, const int layer) +void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector>>& neighbours, const int layer) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer)); LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair) / MB); - // TOOD: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :) + // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :) checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } template -void TimeFrameGPU::downloadNeighboursLUT(std::vector& lut, const int layer) +void TimeFrameGPU::downloadNeighboursLUTDevice(std::vector& lut, const int layer) { - LOGP(debug, "gpu-transfer: downloading {} neighbours lut, for {} MB.", lut.size(), lut.size() * sizeof(int) / MB); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours LUT from layer {}", layer)); + LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB); checkGPUError(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadTrackITSExtDevice(std::vector& seeds) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading tracks"); LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB); checkGPUError(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); checkGPUError(cudaHostUnregister(mTrackITSExt.data())); checkGPUError(cudaHostUnregister(seeds.data())); - // discardResult(cudaDeviceSynchronize()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::unregisterRest() { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory"); LOGP(debug, "unregistering rest of the host memory..."); - checkGPUError(cudaHostUnregister(mCells[0].data())); checkGPUError(cudaHostUnregister(mCellsDevice.data())); - checkGPUError(cudaHostUnregister(mCellsLUTDevice.data())); - for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - checkGPUError(cudaHostUnregister(mCellsLookupTable[iLayer].data())); - } -} -//////////////////////////////////////////////////////////////////////// -/// Legacy -template -void TimeFrameGPU::registerHostMemory(const int maxLayers) -{ - if (mHostRegistered) { - return; - } else { - mHostRegistered = true; - } - for (auto iLayer{0}; iLayer < maxLayers; ++iLayer) { - checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - checkGPUError(cudaHostRegister(mNClustersPerROF[iLayer].data(), mNClustersPerROF[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaHostRegister(mIndexTables[iLayer].data(), (mStaticTrackingParams.ZBins * mStaticTrackingParams.PhiBins + 1) * mNrof * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaHostUnregister(mTrackletsDevice.data())); + checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data())); + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + if (iLayer < nLayers - 2) { + checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data())); + } + checkGPUError(cudaHostUnregister(mTracklets[iLayer].data())); } - checkGPUError(cudaHostRegister(mHostNTracklets.data(), (nLayers - 1) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaHostRegister(mHostNCells.data(), (nLayers - 2) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable)); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::unregisterHostMemory(const int maxLayers) { for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + checkGPUError(cudaHostUnregister(mUnsortedClusters[iLayer].data())); + checkGPUError(cudaHostUnregister(mClusters[iLayer].data())); checkGPUError(cudaHostUnregister(mTrackingFrameInfo[iLayer].data())); } checkGPUError(cudaHostUnregister(mTrackingFrameInfoDevice.data())); + checkGPUError(cudaHostUnregister(mUnsortedClustersDevice.data())); + checkGPUError(cudaHostUnregister(mClustersDevice.data())); } template @@ -300,86 +396,6 @@ void TimeFrameGPU::initialise(const int iteration, o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); } -template -void TimeFrameGPU::wipe(const int maxLayers) -{ - unregisterHostMemory(maxLayers); -} - -template -void TimeFrameGPU::initDevice(IndexTableUtils* utils, - const TrackingParameters& trkParam, - const TimeFrameGPUParameters& gpuParam, - const int maxLayers, - const int iteration) -{ - // mStaticTrackingParams.ZBins = trkParam.ZBins; - // mStaticTrackingParams.PhiBins = trkParam.PhiBins; - // if (mFirstInit) { - // mGpuParams = gpuParam; - // allocMemAsync(reinterpret_cast(&mTrackingParamsDevice), sizeof(gpu::StaticTrackingParameters), nullptr, true); - // checkGPUError(cudaMemcpy(mTrackingParamsDevice, &mStaticTrackingParams, sizeof(gpu::StaticTrackingParameters), cudaMemcpyHostToDevice)); - // if (utils) { // If utils is not nullptr, then its gpu vertexing - // mIndexTableUtils = *utils; - // allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, true); - // } else { // GPU tracking otherwise - // mIndexTableUtils.setTrackingParameters(trkParam); - // } - - // mMemChunks.resize(mGpuParams.nTimeFrameChunks, GpuTimeFrameChunk{static_cast(this), mGpuParams}); - // mVerticesInChunks.resize(mGpuParams.nTimeFrameChunks); - // mNVerticesInChunks.resize(mGpuParams.nTimeFrameChunks); - // mLabelsInChunks.resize(mGpuParams.nTimeFrameChunks); - // LOGP(info, "Size of fixed part is: {} MB", GpuTimeFrameChunk::computeFixedSizeBytes(mGpuParams) / MB); - // LOGP(info, "Size of scaling part is: {} MB", GpuTimeFrameChunk::computeScalingSizeBytes(GpuTimeFrameChunk::computeRofPerChunk(mGpuParams, mAvailMemGB), mGpuParams) / MB); - // LOGP(info, "Allocating {} chunks of {} rofs capacity each.", mGpuParams.nTimeFrameChunks, mGpuParams.nROFsPerChunk); - - // for (int iChunk{0}; iChunk < mMemChunks.size(); ++iChunk) { - // mMemChunks[iChunk].allocate(GpuTimeFrameChunk::computeRofPerChunk(mGpuParams, mGpuParams.maxGPUMemoryGB), mGpuStreams[iChunk]); - // } - // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - // allocMemAsync(reinterpret_cast(&mROframesClustersDevice[iLayer]), mROframesClusters[iLayer].size() * sizeof(int), nullptr, true); - // allocMemAsync(reinterpret_cast(&(mUsedClustersDevice[iLayer])), sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof, nullptr, true); - // } - // allocMemAsync(reinterpret_cast(&mVerticesDevice), sizeof(Vertex) * mGpuParams.maxVerticesCapacity, nullptr, true); - // allocMemAsync(reinterpret_cast(&mROframesPVDevice), sizeof(int) * (mNrof + 1), nullptr, true); - - // mFirstInit = false; - // } - // if (maxLayers < nLayers) { // Vertexer - // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - // checkGPUError(cudaMemcpy(mROframesClustersDevice[iLayer], mROframesClusters[iLayer].data(), mROframesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); - // } - // } else { // Tracker - // checkGPUError(cudaMemcpy(mVerticesDevice, mPrimaryVertices.data(), sizeof(Vertex) * mPrimaryVertices.size(), cudaMemcpyHostToDevice)); - // checkGPUError(cudaMemcpy(mROframesPVDevice, mROframesPV.data(), sizeof(int) * mROframesPV.size(), cudaMemcpyHostToDevice)); - // if (!iteration) { - // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - // checkGPUError(cudaMemset(mUsedClustersDevice[iLayer], 0, sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof)); - // } - // } - // } - // checkGPUError(cudaMemcpy(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice)); -} - -template -unsigned char* TimeFrameGPU::getDeviceUsedClusters(const int layer) -{ - return mUsedClustersDevice[layer]; -} - -template -gsl::span TimeFrameGPU::getHostNTracklets(const int chunkId) -{ - return gsl::span(mHostNTracklets.data() + (nLayers - 1) * chunkId, nLayers - 1); -} - -template -gsl::span TimeFrameGPU::getHostNCells(const int chunkId) -{ - return gsl::span(mHostNCells.data() + (nLayers - 2) * chunkId, nLayers - 2); -} - template class TimeFrameGPU<7>; } // namespace gpu } // namespace its diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 45fee9976bca6..3c6a307fc4ff6 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -29,6 +29,8 @@ template void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) { mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers); + mTimeFrameGPU->loadClustersDevice(iteration); + mTimeFrameGPU->loadUnsortedClustersDevice(iteration); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); } @@ -317,18 +319,66 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int template void TrackerTraitsGPU::computeCellsHybrid(const int iteration) { - TrackerTraits::computeLayerCells(iteration); -}; + mTimeFrameGPU->loadTrackletsDevice(); + mTimeFrameGPU->loadTrackletsLUTDevice(); + mTimeFrameGPU->createCellsLUTDevice(); + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + + // #pragma omp parallel for num_threads(nLayers) + for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { + if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() || + mTimeFrameGPU->getTracklets()[iLayer].empty()) { + continue; + } + + const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getTracklets()[iLayer].size())}; + countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getDeviceArrayUnsortedClusters(), + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), + mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getTracklets()[iLayer].size(), + iLayer, + nullptr, + mTimeFrameGPU->getDeviceArrayCellsLUT(), + mTimeFrameGPU->getDeviceCellLUTs()[iLayer], + mBz, + mTrkParams[iteration].MaxChi2ClusterAttachment, + mTrkParams[iteration].CellDeltaTanLambdaSigma, + mTrkParams[iteration].NSigmaCut, + conf.nBlocks, + conf.nThreads); + mTimeFrameGPU->createCellsBuffers(iLayer); + computeCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getDeviceArrayUnsortedClusters(), + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), + mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getTracklets()[iLayer].size(), + iLayer, + mTimeFrameGPU->getDeviceCells()[iLayer], + mTimeFrameGPU->getDeviceArrayCellsLUT(), + mTimeFrameGPU->getDeviceCellLUTs()[iLayer], + mBz, + mTrkParams[iteration].MaxChi2ClusterAttachment, + mTrkParams[iteration].CellDeltaTanLambdaSigma, + mTrkParams[iteration].NSigmaCut, + conf.nBlocks, + conf.nThreads); + } + // Needed for processNeighbours() which is still on CPU. + mTimeFrameGPU->downloadCellsDevice(); + mTimeFrameGPU->downloadCellsLUTDevice(); +} template void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) { + mTimeFrameGPU->createNeighboursDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - mTimeFrameGPU->loadCellsDevice(); - mTimeFrameGPU->loadCellsLUT(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getCells()[iLayer + 1].size())}; + const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])}; mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); @@ -353,7 +403,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) 1e2, conf.nBlocks, conf.nThreads); - mTimeFrameGPU->downloadNeighboursLUT(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer); + mTimeFrameGPU->downloadNeighboursLUTDevice(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer); // Get the number of found cells from LUT cellsNeighboursLayer[iLayer].resize(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].back()); mTimeFrameGPU->createNeighboursDevice(iLayer, cellsNeighboursLayer[iLayer]); @@ -372,12 +422,12 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) conf.nThreads); mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size()); - mTimeFrameGPU->downloadCellsDevice(iLayer + 1); // Cells on layer 0 did not change. filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer], mTimeFrameGPU->getDeviceNeighbours(iLayer), cellsNeighboursLayer[iLayer].size()); } + mTimeFrameGPU->downloadCellsDevice(); mTimeFrameGPU->unregisterRest(); }; @@ -415,19 +465,18 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->createTrackITSExtDevice(trackSeeds); mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - trackSeedHandler( - mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds, - mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo, - mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks, - trackSeeds.size(), // const size_t nSeeds, - mBz, // const float Bz, - startLevel, // const int startLevel, - mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment, - mTrkParams[0].MaxChi2NDF, // float maxChi2NDF, - mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator - mCorrType, // o2::base::PropagatorImpl::MatCorrType - conf.nBlocks, - conf.nThreads); + trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds, + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo, + mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks, + trackSeeds.size(), // const size_t nSeeds, + mBz, // const float Bz, + startLevel, // const int startLevel, + mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment, + mTrkParams[0].MaxChi2NDF, // float maxChi2NDF, + mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator + mCorrType, // o2::base::PropagatorImpl::MatCorrType + conf.nBlocks, + conf.nThreads); mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 9d00892f4b680..e31e3f378298b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -130,6 +130,72 @@ GPUd() bool fitTrack(TrackITSExt& track, return o2::gpu::GPUCommonMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); } +GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, + const Cluster& cluster2, + const TrackingFrameInfo& tf3, + const float bz) +{ + const float ca = o2::gpu::CAMath::Cos(tf3.alphaTrackingFrame), sa = o2::gpu::CAMath::Sin(tf3.alphaTrackingFrame); + const float x1 = cluster1.xCoordinate * ca + cluster1.yCoordinate * sa; + const float y1 = -cluster1.xCoordinate * sa + cluster1.yCoordinate * ca; + const float z1 = cluster1.zCoordinate; + const float x2 = cluster2.xCoordinate * ca + cluster2.yCoordinate * sa; + const float y2 = -cluster2.xCoordinate * sa + cluster2.yCoordinate * ca; + const float z2 = cluster2.zCoordinate; + const float x3 = tf3.xTrackingFrame; + const float y3 = tf3.positionTrackingFrame[0]; + const float z3 = tf3.positionTrackingFrame[1]; + + const bool zeroField{o2::gpu::GPUCommonMath::Abs(bz) < o2::constants::math::Almost0}; + const float tgp = zeroField ? o2::gpu::CAMath::ATan2(y3 - y1, x3 - x1) : 1.f; + const float crv = zeroField ? 1.f : math_utils::computeCurvature(x3, y3, x2, y2, x1, y1); + const float snp = zeroField ? tgp / o2::gpu::CAMath::Sqrt(1.f + tgp * tgp) : crv * (x3 - math_utils::computeCurvatureCentreX(x3, y3, x2, y2, x1, y1)); + const float tgl12 = math_utils::computeTanDipAngle(x1, y1, x2, y2, z1, z2); + const float tgl23 = math_utils::computeTanDipAngle(x2, y2, x3, y3, z2, z3); + const float q2pt = zeroField ? 1.f / o2::track::kMostProbablePt : crv / (bz * o2::constants::math::B2C); + const float q2pt2 = crv * crv; + const float sg2q2pt = o2::track::kC1Pt2max * (q2pt2 > 0.0005 ? (q2pt2 < 1 ? q2pt2 : 1) : 0.0005); + return track::TrackParCov(tf3.xTrackingFrame, tf3.alphaTrackingFrame, + {y3, z3, snp, 0.5f * (tgl12 + tgl23), q2pt}, + {tf3.covarianceTrackingFrame[0], + tf3.covarianceTrackingFrame[1], tf3.covarianceTrackingFrame[2], + 0.f, 0.f, track::kCSnp2max, + 0.f, 0.f, 0.f, track::kCTgl2max, + 0.f, 0.f, 0.f, 0.f, sg2q2pt}); +} + +template +struct pair_to_first : public thrust::unary_function, T1> { + GPUhd() int operator()(const gpuPair& a) const + { + return a.first; + } +}; + +template +struct pair_to_second : public thrust::unary_function, T2> { + GPUhd() int operator()(const gpuPair& a) const + { + return a.second; + } +}; + +template +struct is_invalid_pair { + GPUhd() bool operator()(const gpuPair& p) const + { + return p.first == -1 && p.second == -1; + } +}; + +template +struct is_valid_pair { + GPUhd() bool operator()(const gpuPair& p) const + { + return !(p.first == -1 && p.second == -1); + } +}; + template GPUg() void fitTrackSeedsKernel( CellSeed* trackSeeds, @@ -208,8 +274,8 @@ GPUg() void computeLayerCellNeighboursKernel( for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells; iCurrentCellIndex += blockDim.x * gridDim.x) { const auto& currentCellSeed{cellSeedArray[layerIndex][iCurrentCellIndex]}; const int nextLayerTrackletIndex{currentCellSeed.getSecondTrackletIndex()}; - const int nextLayerFirstCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex]}; - const int nextLayerLastCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex + 1]}; + const int nextLayerFirstCellIndex{cellsLUTs[layerIndex + 1][nextLayerTrackletIndex]}; + const int nextLayerLastCellIndex{cellsLUTs[layerIndex + 1][nextLayerTrackletIndex + 1]}; int foundNeighbours{0}; for (int iNextCell{nextLayerFirstCellIndex}; iNextCell < nextLayerLastCellIndex; ++iNextCell) { CellSeed nextCellSeed{cellSeedArray[layerIndex + 1][iNextCell]}; // Copy @@ -243,41 +309,94 @@ GPUg() void computeLayerCellNeighboursKernel( } } -template -struct pair_to_first : public thrust::unary_function, T1> { - GPUhd() int operator()(const gpuPair& a) const - { - return a.first; - } -}; +template +GPUg() void computeLayerCellsKernel( + const Cluster** sortedClusters, + const Cluster** unsortedClusters, + const TrackingFrameInfo** tfInfo, + const Tracklet** tracklets, + const int** trackletsLUT, + const int nTrackletsCurrent, + const int layer, + CellSeed* cells, + int** cellsLUTs, + const float bz, + const float maxChi2ClusterAttachment, + const float cellDeltaTanLambdaSigma, + const float nSigmaCut) +{ + constexpr float radl = 9.36f; // Radiation length of Si [cm]. + constexpr float rho = 2.33f; // Density of Si [g/cm^3]. + constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment. + for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) { + const Tracklet& currentTracklet = tracklets[layer][iCurrentTrackletIndex]; + const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; + const int nextLayerFirstTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex]}; + const int nextLayerLastTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex + 1]}; + if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) { + continue; + } + int foundCells{0}; + for (int iNextTrackletIndex{nextLayerFirstTrackletIndex}; iNextTrackletIndex < nextLayerLastTrackletIndex; ++iNextTrackletIndex) { + if (tracklets[layer + 1][iNextTrackletIndex].firstClusterIndex != nextLayerClusterIndex) { + break; + } + const Tracklet& nextTracklet = tracklets[layer + 1][iNextTrackletIndex]; + const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; -template -struct pair_to_second : public thrust::unary_function, T2> { - GPUhd() int operator()(const gpuPair& a) const - { - return a.second; - } -}; + if (deltaTanLambda / cellDeltaTanLambdaSigma < nSigmaCut) { + const int clusId[3]{ + sortedClusters[layer][currentTracklet.firstClusterIndex].clusterId, + sortedClusters[layer + 1][nextTracklet.firstClusterIndex].clusterId, + sortedClusters[layer + 2][nextTracklet.secondClusterIndex].clusterId}; -template -struct is_invalid_pair { - GPUhd() bool operator()(const gpuPair& p) const - { - return p.first == -1 && p.second == -1; - } -}; + const auto& cluster1_glo = unsortedClusters[layer][clusId[0]]; + const auto& cluster2_glo = unsortedClusters[layer + 1][clusId[1]]; + const auto& cluster3_tf = tfInfo[layer + 2][clusId[2]]; + auto track{buildTrackSeed(cluster1_glo, cluster2_glo, cluster3_tf, bz)}; + float chi2{0.f}; + bool good{false}; + for (int iC{2}; iC--;) { + const TrackingFrameInfo& trackingHit = tfInfo[layer + iC][clusId[iC]]; + if (!track.rotate(trackingHit.alphaTrackingFrame)) { + break; + } + if (!track.propagateTo(trackingHit.xTrackingFrame, bz)) { + break; + } -template -struct is_valid_pair { - GPUhd() bool operator()(const gpuPair& p) const - { - return !(p.first == -1 && p.second == -1); + if (!track.correctForMaterial(layerxX0[layer + iC], layerxX0[layer] * radl * rho, true)) { + break; + } + + const auto predChi2{track.getPredictedChi2Quiet(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)}; + if (!track.o2::track::TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) { + break; + } + if (!iC && predChi2 > maxChi2ClusterAttachment) { + break; + } + good = !iC; + chi2 += predChi2; + } + if (!good) { + continue; + } + if constexpr (!initRun) { + new (cells + cellsLUTs[layer][iCurrentTrackletIndex] + foundCells) CellSeed{layer, clusId[0], clusId[1], clusId[2], iCurrentTrackletIndex, iNextTrackletIndex, track, chi2}; + } + ++foundCells; + if constexpr (initRun) { + cellsLUTs[layer][iCurrentTrackletIndex] = foundCells; + } + } + } } -}; +} -//////////////////////////////////////////////////////////////////////////////// -// Legacy Kernels, to possibly take inspiration from -//////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////// +// Debug Kernels +///////////////////////////////////////// GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, const o2::its::IndexTableUtils& utils, const float z1, const float z2, float maxdeltaz, float maxdeltaphi) @@ -304,6 +423,20 @@ GPUhd() float Sq(float q) return q * q; } +template +GPUd() void pPointer(T* ptr) +{ + printf("[%p]\t", ptr); +} +template +GPUg() void printPointersKernel(std::tuple args) +{ + auto print_all = [&](auto... ptrs) { + (pPointer(ptrs), ...); + }; + std::apply(print_all, args); +} + // Functors to sort tracklets template struct trackletSortEmptyFunctor : public thrust::binary_function { @@ -335,6 +468,32 @@ GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int } } +GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) +{ + if (blockIdx.x * blockDim.x + threadIdx.x == tId) { + for (int i{0}; i < rowLength; ++i) { + if (!(i % len)) { + printf("\n row %d: ===> %d/%d\t", row, i, (int)rowLength); + } + printf("%d\t", mat[row][i]); + } + printf("\n"); + } +} + +GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned int size, const int len = 150, const unsigned int tId = 0) +{ + if (blockIdx.x * blockDim.x + threadIdx.x == tId) { + for (int i{0}; i < size; ++i) { + if (!(i % len)) { + printf("\n layer %d: ===> %d/%d\t", layer, i, (int)size); + } + printf("%p\t", (void*)v[i]); + } + printf("\n"); + } +} + // Dump vertices GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0) { @@ -642,99 +801,92 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } } -// Compute cells kernel -template -GPUg() void computeLayerCellsKernel( - const Tracklet* trackletsCurrentLayer, - const Tracklet* trackletsNextLayer, - const int* trackletsCurrentLayerLUT, - const int nTrackletsCurrent, +} // namespace gpu + +void countCellsHandler( + const Cluster** sortedClusters, + const Cluster** unsortedClusters, + const TrackingFrameInfo** tfInfo, + const Tracklet** tracklets, + const int** trackletsLUT, + const int nTracklets, + const int layer, CellSeed* cells, - int* cellsLUTs, - const StaticTrackingParameters* trkPars) + int** cellsLUTsArrayDevice, + int* cellsLUTsHost, + const float bz, + const float maxChi2ClusterAttachment, + const float cellDeltaTanLambdaSigma, + const float nSigmaCut, + const int nBlocks, + const int nThreads) { - for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) { - const Tracklet& currentTracklet = trackletsCurrentLayer[iCurrentTrackletIndex]; - const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; - const int nextLayerFirstTrackletIndex{trackletsCurrentLayerLUT[nextLayerClusterIndex]}; - const int nextLayerLastTrackletIndex{trackletsCurrentLayerLUT[nextLayerClusterIndex + 1]}; - if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) { - continue; - } - int foundCells{0}; - for (int iNextTrackletIndex{nextLayerFirstTrackletIndex}; iNextTrackletIndex < nextLayerLastTrackletIndex; ++iNextTrackletIndex) { - if (trackletsNextLayer[iNextTrackletIndex].firstClusterIndex != nextLayerClusterIndex) { - break; - } - const Tracklet& nextTracklet = trackletsNextLayer[iNextTrackletIndex]; - const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; - - if (deltaTanLambda / trkPars->CellDeltaTanLambdaSigma < trkPars->NSigmaCut) { - if constexpr (!initRun) { - new (cells + cellsLUTs[iCurrentTrackletIndex] + foundCells) Cell{currentTracklet.firstClusterIndex, nextTracklet.firstClusterIndex, - nextTracklet.secondClusterIndex, - iCurrentTrackletIndex, - iNextTrackletIndex}; - } - ++foundCells; - } - } - if constexpr (initRun) { - // Fill cell Lookup table - cellsLUTs[iCurrentTrackletIndex] = foundCells; - } - } + gpu::computeLayerCellsKernel<<>>( + sortedClusters, // const Cluster** + unsortedClusters, // const Cluster** + tfInfo, // const TrackingFrameInfo** + tracklets, // const Tracklets** + trackletsLUT, // const int** + nTracklets, // const int + layer, // const int + cells, // CellSeed* + cellsLUTsArrayDevice, // int** + bz, // const float + maxChi2ClusterAttachment, // const float + cellDeltaTanLambdaSigma, // const float + nSigmaCut); // const float + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + cellsLUTsHost, // d_in + cellsLUTsHost, // d_out + nTracklets + 1, // num_items + 0)); + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + cellsLUTsHost, // d_in + cellsLUTsHost, // d_out + nTracklets + 1, // num_items + 0)); + // gpu::printBufferLayerOnThread<<<1, 1>>>(layer, cellsLUTsHost, nTracklets + 1); + gpuCheckError(cudaFree(d_temp_storage)); } -template -GPUg() void computeLayerRoadsKernel( - const int level, - const int layerIndex, - CellSeed** cells, - const int* nCells, - int** neighbours, - int** neighboursLUT, - Road* roads, - int* roadsLookupTable) +void computeCellsHandler( + const Cluster** sortedClusters, + const Cluster** unsortedClusters, + const TrackingFrameInfo** tfInfo, + const Tracklet** tracklets, + const int** trackletsLUT, + const int nTracklets, + const int layer, + CellSeed* cells, + int** cellsLUTsArrayDevice, + int* cellsLUTsHost, + const float bz, + const float maxChi2ClusterAttachment, + const float cellDeltaTanLambdaSigma, + const float nSigmaCut, + const int nBlocks, + const int nThreads) { - for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells[layerIndex]; iCurrentCellIndex += blockDim.x * gridDim.x) { - auto& currentCell{cells[layerIndex][iCurrentCellIndex]}; - if (currentCell.getLevel() != level) { - continue; - } - int nRoadsCurrentCell{0}; - if constexpr (dryRun) { - roadsLookupTable[iCurrentCellIndex]++; - } else { - roads[roadsLookupTable[iCurrentCellIndex] + nRoadsCurrentCell++] = Road{layerIndex, iCurrentCellIndex}; - } - if (level == 1) { - continue; - } - - const auto currentCellNeighOffset{neighboursLUT[layerIndex - 1][iCurrentCellIndex]}; - const int cellNeighboursNum{neighboursLUT[layerIndex - 1][iCurrentCellIndex + 1] - currentCellNeighOffset}; - bool isFirstValidNeighbour{true}; - for (int iNeighbourCell{0}; iNeighbourCell < cellNeighboursNum; ++iNeighbourCell) { - const int neighbourCellId = neighbours[layerIndex - 1][currentCellNeighOffset + iNeighbourCell]; - const CellSeed& neighbourCell = cells[layerIndex - 1][neighbourCellId]; - if (level - 1 != neighbourCell.getLevel()) { - continue; - } - if (isFirstValidNeighbour) { - isFirstValidNeighbour = false; - } else { - if constexpr (dryRun) { - roadsLookupTable[iCurrentCellIndex]++; // dry run we just count the number of roads - } else { - roads[roadsLookupTable[iCurrentCellIndex] + nRoadsCurrentCell++] = Road{layerIndex, iCurrentCellIndex}; - } - } - // traverseCellsTreeDevice(neighbourCellId, layerIndex - 1, iCurrentCellIndex, nRoadsCurrentCell, roadsLookupTable, cells, roads); - } - } + gpu::computeLayerCellsKernel<<>>( + sortedClusters, // const Cluster** + unsortedClusters, // const Cluster** + tfInfo, // const TrackingFrameInfo** + tracklets, // const Tracklets** + trackletsLUT, // const int** + nTracklets, // const int + layer, // const int + cells, // CellSeed* + cellsLUTsArrayDevice, // int** + bz, // const float + maxChi2ClusterAttachment, // const float + cellDeltaTanLambdaSigma, // const float + nSigmaCut); // const float } -} // namespace gpu void countCellNeighboursHandler(CellSeed** cellsLayersDevice, int* neighboursLUT, @@ -866,16 +1018,16 @@ void trackSeedHandler(CellSeed* trackSeeds, const int nThreads) { gpu::fitTrackSeedsKernel<<>>( - trackSeeds, // CellSeed* trackSeeds, - foundTrackingFrameInfo, // TrackingFrameInfo** foundTrackingFrameInfo, - tracks, // o2::its::TrackITSExt* tracks, - nSeeds, // const unsigned int nSeeds, - Bz, // const float Bz, - startLevel, // const int startLevel, - maxChi2ClusterAttachment, // float maxChi2ClusterAttachment, - maxChi2NDF, // float maxChi2NDF, - propagator, // const o2::base::Propagator* propagator - matCorrType); // o2::base::PropagatorF::MatCorrType matCorrType + trackSeeds, // CellSeed* + foundTrackingFrameInfo, // TrackingFrameInfo** + tracks, // TrackITSExt* + nSeeds, // const unsigned int + Bz, // const float + startLevel, // const int + maxChi2ClusterAttachment, // float + maxChi2NDF, // float + propagator, // const o2::base::Propagator* + matCorrType); // o2::base::PropagatorF::MatCorrType gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h index 482bb38b19bad..cb9f28665cf07 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h @@ -80,6 +80,7 @@ class CellSeed final : public o2::track::TrackParCovF public: GPUhdDefault() CellSeed() = default; GPUhdDefault() CellSeed(const CellSeed&) = default; + GPUhdDefault() ~CellSeed() = default; GPUd() CellSeed(int innerL, int cl0, int cl1, int cl2, int trkl0, int trkl1, o2::track::TrackParCovF& tpc, float chi2) : o2::track::TrackParCovF{tpc}, mLevel{1}, mChi2{chi2} { setUserField(innerL); diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 4457d4515e0a6..2ed7c77848227 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -13,6 +13,7 @@ /// \brief /// +// #define CA_DEBUG #include "ITStracking/TrackerTraits.h" #include @@ -477,6 +478,7 @@ void TrackerTraits::processNeighbours(int iLayer, int iLevel, const std::vector< exit(1); } CA_DEBUGGER(std::cout << "Processing neighbours layer " << iLayer << " level " << iLevel << ", size of the cell seeds: " << currentCellSeed.size() << std::endl); + CA_DEBUGGER(std::cout << " resizing: " << mTimeFrame->getCellsNeighboursLUT()[iLayer - 1].size() << std::endl); updatedCellSeeds.reserve(mTimeFrame->getCellsNeighboursLUT()[iLayer - 1].size()); /// This is not the correct value, we could do a loop to count the number of neighbours updatedCellsIds.reserve(updatedCellSeeds.size()); auto propagator = o2::base::Propagator::Instance(); @@ -488,6 +490,7 @@ void TrackerTraits::processNeighbours(int iLayer, int iLevel, const std::vector< for (unsigned int iCell = 0; iCell < currentCellSeed.size(); ++iCell) { const CellSeed& currentCell{currentCellSeed[iCell]}; if (currentCell.getLevel() != iLevel) { + CA_DEBUGGER(std::cout << " continuing." << std::endl); continue; } if (currentCellId.empty() && (mTimeFrame->isClusterUsed(iLayer, currentCell.getFirstClusterIndex()) ||