From f8824df4c81dc0c5d8e33587c41c875acbe6c0eb Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 5 Dec 2024 14:52:11 +0100 Subject: [PATCH] CP --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 11 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 27 +- .../ITS/tracking/GPU/cuda/CMakeLists.txt | 2 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 38 ++- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 45 ++- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 261 ++++++++++++++---- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 7 +- 7 files changed, 306 insertions(+), 85 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 37f392ebbd3a7..c5afb4a7371f5 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame void createCellsBuffers(const int); void createCellsDevice(); void createCellsLUTDevice(); - void createNeighboursDevice(); + void createNeighboursIndexTablesDevice(); void createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours); void createNeighboursLUTDevice(const int, const unsigned int); + void createNeighboursDeviceArray(); void createTrackITSExtDevice(std::vector&); void downloadTrackITSExtDevice(std::vector&); void downloadCellsNeighboursDevice(std::vector>>&, const int); @@ -113,7 +114,9 @@ class TimeFrameGPU : public TimeFrame Road* getDeviceRoads() { return mRoadsDevice; } TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; } int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; } - gpuPair* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } + gpuPair* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; } + int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } + int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; } TrackingFrameInfo* getDeviceTrackingFrameInfo(const int); const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } @@ -195,7 +198,9 @@ class TimeFrameGPU : public TimeFrame Road* mRoadsDevice; TrackITSExt* mTrackITSExtDevice; - std::array*, nLayers - 2> mNeighboursDevice; + std::array*, nLayers - 2> mNeighbourPairsDevice; + std::array mNeighboursDevice; + int** mNeighboursDeviceArray; std::array mTrackingFrameInfoDevice; const TrackingFrameInfo** mTrackingFrameInfoDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 54bdae302e643..20632a2c50371 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -175,9 +175,30 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nBlocks, const int nThreads); -void filterCellNeighboursHandler(std::vector&, - gpuPair*, - unsigned int); +int filterCellNeighboursHandler(std::vector&, + gpuPair*, + int*, + unsigned int); + +void processNeighboursHandler(const int iteration, + const int startLayer, + const int startLevel, + CellSeed** currentCellSeeds, + const unsigned int nCurrentCells, + // const int* currentCellIds, + // const unsigned int nCurrentCellsIds, + // CellSeed* updatedCellSeeds, + // int* updatedCellsIds, + const unsigned char** usedClusters, // Used clusters + int* neighbours, + int* neighboursLUT, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float MaxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType, + const int nBlocks, + const int nThreads); void trackSeedHandler(CellSeed* trackSeeds, const TrackingFrameInfo** foundTrackingFrameInfo, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 3cdb107e07438..e2fc1f1388ad0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ if(CUDA_ENABLED) find_package(CUDAToolkit) message(STATUS "Building ITS CUDA tracker") -# add_compile_options(-O0 -g -lineinfo -fPIC) +add_compile_options(-O0 -g -lineinfo -fPIC) # add_compile_definitions(ITS_MEASURE_GPU_TIME) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 4bd15c0203d81..fd067b9930fd0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -306,17 +306,28 @@ void TimeFrameGPU::loadTrackletsLUTDevice() } template -void TimeFrameGPU::createNeighboursDevice() +void TimeFrameGPU::createNeighboursIndexTablesDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours"); + // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); + LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc + checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -400,19 +411,20 @@ void TimeFrameGPU::createNeighboursDevice(const unsigned int& layer, st START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); mCellsNeighbours[layer].clear(); mCellsNeighbours[layer].resize(neighbours.size()); + LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); + allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator()); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template -void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) +void TimeFrameGPU::createNeighboursDeviceArray() { - START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); - LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc - checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); + allocMemAsync(reinterpret_cast(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -459,7 +471,7 @@ void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector) / MB); // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :) - checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index ae86507e46325..d46e81421879f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -214,7 +214,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) template void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) { - mTimeFrameGPU->createNeighboursDevice(); + mTimeFrameGPU->createNeighboursIndexTablesDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { @@ -228,17 +228,16 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) continue; } - int layerCellsNum{static_cast(mTimeFrameGPU->getCells()[iLayer].size())}; mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. mTimeFrameGPU->getDeviceArrayCellsLUT(), - mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), mTrkParams[0].MaxChi2ClusterAttachment, mBz, iLayer, - layerCellsNum, + mTimeFrameGPU->getNCells()[iLayer], nextLayerCellsNum, 1e2, conf.nBlocks, @@ -250,12 +249,12 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), mTimeFrameGPU->getDeviceArrayCellsLUT(), - mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), mTrkParams[0].MaxChi2ClusterAttachment, mBz, iLayer, - layerCellsNum, + mTimeFrameGPU->getNCells()[iLayer], nextLayerCellsNum, 1e2, conf.nBlocks, @@ -264,9 +263,11 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size()); filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer], + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighbours(iLayer), cellsNeighboursLayer[iLayer].size()); } + mTimeFrameGPU->createNeighboursDeviceArray(); mTimeFrameGPU->downloadCellsDevice(); mTimeFrameGPU->unregisterRest(); }; @@ -274,10 +275,26 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) template void TrackerTraitsGPU::findRoads(const int iteration) { + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); for (int startLevel{mTrkParams[iteration].CellsPerRoad()}; startLevel >= mTrkParams[iteration].CellMinimumLevel(); --startLevel) { const int minimumLayer{startLevel - 1}; std::vector trackSeeds; for (int startLayer{mTrkParams[iteration].CellsPerRoad() - 1}; startLayer >= minimumLayer; --startLayer) { + processNeighboursHandler(iteration, + startLayer, + startLevel, + mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getNCells()[startLayer], + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceNeighbours(startLayer - 1), + mTimeFrameGPU->getDeviceNeighboursLUT(startLayer - 1), + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), + mBz, + mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment + mTimeFrameGPU->getDevicePropagator(), + mCorrType, + conf.nBlocks, + conf.nThreads); std::vector lastCellId, updatedCellId; std::vector lastCellSeed, updatedCellSeed; @@ -304,15 +321,15 @@ void TrackerTraitsGPU::findRoads(const int iteration) } mTimeFrameGPU->createTrackITSExtDevice(trackSeeds); mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds); - auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds, - mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo, - mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks, - trackSeeds.size(), // const size_t nSeeds, - mBz, // const float Bz, + + trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds + mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo + mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks + trackSeeds.size(), // const size_t nSeeds + mBz, // const float Bz startLevel, // const int startLevel, - mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment, - mTrkParams[0].MaxChi2NDF, // float maxChi2NDF, + mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment + mTrkParams[0].MaxChi2NDF, // float maxChi2NDF mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator mCorrType, // o2::base::PropagatorImpl::MatCorrType conf.nBlocks, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 229827611c077..046f45e8f4fb2 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -112,7 +112,7 @@ GPUd() bool fitTrack(TrackITSExt& track, float chi2ndfcut, float maxQoverPt, int nCl, - float Bz, + float bz, const TrackingFrameInfo** tfInfos, const o2::base::Propagator* prop, o2::base::PropagatorF::MatCorrType matCorrType) @@ -128,7 +128,7 @@ GPUd() bool fitTrack(TrackITSExt& track, if (!prop->propagateToX(track, trackingHit.xTrackingFrame, - Bz, + bz, o2::base::PropagatorImpl::MAX_SIN_PHI, o2::base::PropagatorImpl::MAX_STEP, matCorrType)) { @@ -268,7 +268,7 @@ GPUg() void fitTrackSeedsKernel( const TrackingFrameInfo** foundTrackingFrameInfo, o2::its::TrackITSExt* tracks, const unsigned int nSeeds, - const float Bz, + const float bz, const int startLevel, float maxChi2ClusterAttachment, float maxChi2NDF, @@ -294,7 +294,7 @@ GPUg() void fitTrackSeedsKernel( maxChi2NDF, // float maxChi2NDF, o2::constants::math::VeryBig, // float maxQoverPt, 0, // nCl, - Bz, // float Bz, + bz, // float bz, foundTrackingFrameInfo, // TrackingFrameInfo** trackingFrameInfo, propagator, // const o2::base::Propagator* propagator, matCorrType); // o2::base::PropagatorF::MatCorrType matCorrType @@ -313,7 +313,7 @@ GPUg() void fitTrackSeedsKernel( maxChi2NDF, // float maxChi2NDF, 50.f, // float maxQoverPt, 0, // nCl, - Bz, // float Bz, + bz, // float bz, foundTrackingFrameInfo, // TrackingFrameInfo** trackingFrameInfo, propagator, // const o2::base::Propagator* propagator, matCorrType); // o2::base::PropagatorF::MatCorrType matCorrType @@ -574,6 +574,107 @@ GPUg() void computeLayerTrackletsMultiROFKernel( } } +template +GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, + int* trackletsLookUpTable, + const int nTracklets) +{ + for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { + atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); + } +} + +template +GPUg() void processNeighboursKernel(const int iteration, + const int layer, + const int level, + CellSeed** currentCellSeeds, + const unsigned int nCurrentCells, + const int* currentCellIds, + // const unsigned int nCurrentCellsIds, + CellSeed* updatedCellSeeds, + int* updatedCellsIds, + int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration + const unsigned char** usedClusters, // Used clusters + int* neighbours, + int* neighboursLUT, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float MaxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType) +{ + constexpr float radl = 9.36f; // Radiation length of Si [cm]. + constexpr float rho = 2.33f; // Density of Si [g/cm^3]. + constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment. + for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) { + int foundSeeds{0}; + const auto& currentCell{currentCellSeeds[layer][iCurrentCell]}; + if (currentCell.getLevel() != level) { + continue; + } + if (currentCellIds == nullptr && (usedClusters[layer][currentCell.getFirstClusterIndex()] || + usedClusters[layer + 1][currentCell.getSecondClusterIndex()] || + usedClusters[layer + 2][currentCell.getThirdClusterIndex()])) { + continue; + } + const int cellId = currentCellIds == nullptr ? iCurrentCell : currentCellIds[iCurrentCell]; + const int startNeighbourId{cellId ? neighboursLUT[cellId - 1] : 0}; + const int endNeighbourId{neighboursLUT[cellId]}; + + for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) { + const int neighbourCellId = neighbours[iNeighbourCell]; + const CellSeed& neighbourCell = currentCellSeeds[layer - 1][neighbourCellId]; + if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) { + continue; + } + if (usedClusters[layer - 1][neighbourCell.getFirstClusterIndex()]) { + continue; + } + if (currentCell.getLevel() - 1 != neighbourCell.getLevel()) { + continue; + } + CellSeed seed{currentCell}; + auto& trHit = foundTrackingFrameInfo[layer - 1][neighbourCell.getFirstClusterIndex()]; + + if (!seed.rotate(trHit.alphaTrackingFrame)) { + continue; + } + + if (!propagator->propagateToX(seed, trHit.xTrackingFrame, bz, o2::base::PropagatorImpl::MAX_SIN_PHI, o2::base::PropagatorImpl::MAX_STEP, matCorrType)) { + continue; + } + + if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) { + if (!seed.correctForMaterial(layerxX0[layer - 1], layerxX0[layer - 1] * radl * rho, true)) { + continue; + } + } + + auto predChi2{seed.getPredictedChi2Quiet(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)}; + if ((predChi2 > MaxChi2ClusterAttachment) || predChi2 < 0.f) { + continue; + } + seed.setChi2(seed.getChi2() + predChi2); + if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) { + CA_DEBUGGER(failed[4]++); + continue; + } + seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex(); + seed.setLevel(neighbourCell.getLevel()); + seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); + seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); + if constexpr (dryRun) { + foundSeedsTable[iCurrentCell]++; + } else { + updatedCellsIds[foundSeedsTable[iCurrentCell] + foundSeeds] = neighbourCellId; + updatedCellSeeds[foundSeedsTable[iCurrentCell] + foundSeeds] = seed; + } + foundSeeds++; + } + } +} + ///////////////////////////////////////// // Debug Kernels ///////////////////////////////////////// @@ -690,36 +791,6 @@ GPUg() void printTrackletsLUTPerROF(const int layerId, } } } - -template -GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, - int* trackletsLookUpTable, - const int nTracklets) -{ - for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { - atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); - } -} - -// Decrease LUT entries corresponding to duplicated tracklets. NB: duplicate tracklets are removed separately (see const Tracklets*). -GPUg() void removeDuplicateTrackletsEntriesLUTKernel( - int* trackletsLookUpTable, - const Tracklet* tracklets, - const int* nTracklets, - const int layerIndex) -{ - int id0{-1}, id1{-1}; - for (int iTracklet{0}; iTracklet < nTracklets[layerIndex]; ++iTracklet) { - auto& trk = tracklets[iTracklet]; - if (trk.firstClusterIndex == id0 && trk.secondClusterIndex == id1) { - trackletsLookUpTable[id0]--; - } else { - id0 = trk.firstClusterIndex; - id1 = trk.secondClusterIndex; - } - } -} - } // namespace gpu template @@ -995,8 +1066,8 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaDeviceSynchronize()); + // gpuCheckError(cudaPeekAtLastError()); + // gpuCheckError(cudaDeviceSynchronize()); void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage @@ -1061,36 +1132,126 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, gpuCheckError(cudaDeviceSynchronize()); } -void filterCellNeighboursHandler(std::vector& neighHost, - gpuPair* cellNeighbours, - unsigned int nNeigh) +int filterCellNeighboursHandler(std::vector& neighHost, // TODO: eventually remove this! + gpuPair* cellNeighbourPairs, + int* cellNeighbours, + unsigned int nNeigh) { - thrust::device_ptr> neighVector(cellNeighbours); + thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); + thrust::device_ptr validNeighs(cellNeighbours); thrust::device_vector keys(nNeigh); // TODO: externally allocate. thrust::device_vector vals(nNeigh); // TODO: externally allocate. - thrust::copy(thrust::make_transform_iterator(neighVector, gpu::pair_to_second()), - thrust::make_transform_iterator(neighVector + nNeigh, gpu::pair_to_second()), + thrust::copy(thrust::make_transform_iterator(neighVectorPairs, gpu::pair_to_second()), + thrust::make_transform_iterator(neighVectorPairs + nNeigh, gpu::pair_to_second()), keys.begin()); thrust::sequence(vals.begin(), vals.end()); thrust::sort_by_key(keys.begin(), keys.end(), vals.begin()); thrust::device_vector> sortedNeigh(nNeigh); - thrust::copy(thrust::make_permutation_iterator(neighVector, vals.begin()), - thrust::make_permutation_iterator(neighVector, vals.end()), + thrust::copy(thrust::make_permutation_iterator(neighVectorPairs, vals.begin()), + thrust::make_permutation_iterator(neighVectorPairs, vals.end()), sortedNeigh.begin()); discardResult(cudaDeviceSynchronize()); auto trimmedBegin = thrust::find_if(sortedNeigh.begin(), sortedNeigh.end(), gpu::is_valid_pair()); // trim leading -1s auto trimmedSize = sortedNeigh.end() - trimmedBegin; - thrust::device_vector validNeigh(trimmedSize); neighHost.resize(trimmedSize); - thrust::transform(trimmedBegin, sortedNeigh.end(), validNeigh.begin(), gpu::pair_to_first()); - gpuCheckError(cudaMemcpy(neighHost.data(), thrust::raw_pointer_cast(validNeigh.data()), trimmedSize * sizeof(int), cudaMemcpyDeviceToHost)); + thrust::transform(trimmedBegin, sortedNeigh.end(), validNeighs, gpu::pair_to_first()); + gpuCheckError(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost)); + + return trimmedSize; +} + +void processNeighboursHandler(const int iteration, + const int startLayer, + const int startLevel, + CellSeed** currentCellSeeds, + const unsigned int nCurrentCells, + // const int* currentCellIds, + // const unsigned int nCurrentCellsIds, + // CellSeed* updatedCellSeeds, + // int* updatedCellsIds, + const unsigned char** usedClusters, // Used clusters + int* neighbours, + int* neighboursLUT, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float MaxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType, + const int nBlocks, + const int nThreads) +{ + thrust::device_vector foundSeedsTable(nCurrentCells + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this. + gpu::processNeighboursKernel<<>>(iteration, + startLayer, + startLevel, + currentCellSeeds, + nCurrentCells, + nullptr, // currentCellIds, + nullptr, // updatedCellSeeds, + nullptr, // updatedCellsIds, + thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + usedClusters, // Used clusters + neighbours, + neighboursLUT, + foundTrackingFrameInfo, + bz, + MaxChi2ClusterAttachment, + propagator, + matCorrType); + void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; + size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCurrentCells + 1, // num_items + 0)); + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + nCurrentCells + 1, // num_items + 0)); + + thrust::device_vector updatedCellIds(foundSeedsTable.back()), lastCellsIds(foundSeedsTable.back()); + thrust::device_vector updatedCellSeeds(foundSeedsTable.back()), lastCellSeeds(foundSeedsTable.back()); + + gpu::processNeighboursKernel<<<1, 1>>>(iteration, + startLayer, + startLevel, + currentCellSeeds, + nCurrentCells, + nullptr, // currentCellIds, + thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds + thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds + thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + usedClusters, // Used clusters + neighbours, + neighboursLUT, + foundTrackingFrameInfo, + bz, + MaxChi2ClusterAttachment, + propagator, + matCorrType); + int level = startLevel; + for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { + lastCellSeeds.swap(updatedCellSeeds); + lastCellIds.swap(updatedCellIds); + std::vector().swap(updatedCellSeed); /// tame the memory peaks + updatedCellId.clear(); + // processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId); + } + + gpuCheckError(cudaFree(d_temp_storage)); + gpuCheckError(cudaFree(d_temp_storage_2)); } void trackSeedHandler(CellSeed* trackSeeds, const TrackingFrameInfo** foundTrackingFrameInfo, o2::its::TrackITSExt* tracks, const unsigned int nSeeds, - const float Bz, + const float bz, const int startLevel, float maxChi2ClusterAttachment, float maxChi2NDF, @@ -1104,7 +1265,7 @@ void trackSeedHandler(CellSeed* trackSeeds, foundTrackingFrameInfo, // TrackingFrameInfo** tracks, // TrackITSExt* nSeeds, // const unsigned int - Bz, // const float + bz, // const float startLevel, // const int maxChi2ClusterAttachment, // float maxChi2NDF, // float diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 409b20ea23235..39e54785916ae 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -467,7 +467,12 @@ void TrackerTraits::findCellsNeighbours(const int iteration) } } -void TrackerTraits::processNeighbours(int iLayer, int iLevel, const std::vector& currentCellSeed, const std::vector& currentCellId, std::vector& updatedCellSeeds, std::vector& updatedCellsIds) +void TrackerTraits::processNeighbours(int iLayer, + int iLevel, + const std::vector& currentCellSeed, + const std::vector& currentCellId, + std::vector& updatedCellSeeds, + std::vector& updatedCellsIds) { if (iLevel < 2 || iLayer < 1) { std::cout << "Error: layer " << iLayer << " or level " << iLevel << " cannot be processed by processNeighbours" << std::endl;