From ff486d4c5b68d39204390ee2e1937d20aa9cb9dd Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 2 Dec 2024 18:59:02 +0100 Subject: [PATCH] tracklets on gpu --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 3 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 3 ++ .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 7 --- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 32 ++++++------ .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 49 ++++++++++++++++--- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 42 +--------------- 6 files changed, 66 insertions(+), 70 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index e288fbae98396..4b9256253cc4e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -138,13 +138,14 @@ class TimeFrameGPU : public TimeFrame // Host-specific getters gsl::span getHostNTracklets(const int chunkId); gsl::span getHostNCells(const int chunkId); + gsl::span getNTracklets() { return mNTracklets; } + gsl::span getNCells() { return mNCells; } // Host-available device getters gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } gsl::span getDeviceTracklet() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } - gsl::span getNCellsDevice() { return mNCells; } private: void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 473f338aa4200..54bdae302e643 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -97,7 +97,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const unsigned char** usedClusters, const int** clustersIndexTables, Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, int** trackletsLUTs, + gsl::span trackletsLUTsHost, const int iteration, const float NSigmaCut, std::vector& phiCuts, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 129060758ae2d..66180a9d14d95 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -489,13 +489,6 @@ void TimeFrameGPU::unregisterRest() LOGP(debug, "unregistering rest of the host memory..."); checkGPUError(cudaHostUnregister(mCellsDevice.data())); checkGPUError(cudaHostUnregister(mTrackletsDevice.data())); - checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data())); - for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - if (iLayer < nLayers - 2) { - checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data())); - } - checkGPUError(cudaHostUnregister(mTracklets[iLayer].data())); - } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index d4ef4322cd43f..8db849daa49c3 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -144,7 +144,10 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceTracklet(), + mTimeFrameGPU->getNTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), iteration, mTrkParams[iteration].NSigmaCut, mTimeFrameGPU->getPhiCuts(), @@ -161,25 +164,22 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int template void TrackerTraitsGPU::computeCellsHybrid(const int iteration) { - mTimeFrameGPU->loadTrackletsDevice(); - mTimeFrameGPU->loadTrackletsLUTDevice(); mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - // #pragma omp parallel for num_threads(nLayers) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() || - mTimeFrameGPU->getTracklets()[iLayer].empty()) { + if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) { + LOGP(info, "continuing here"); continue; } - - const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getTracklets()[iLayer].size())}; + LOGP(info, "+> {}", mTimeFrameGPU->getNTracklets()[iLayer]); + const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, nullptr, mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -196,7 +196,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, mTimeFrameGPU->getDeviceCells()[iLayer], mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -220,7 +220,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])}; + const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); @@ -283,7 +283,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) std::vector lastCellId, updatedCellId; std::vector lastCellSeed, updatedCellSeed; - processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); + processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { @@ -337,8 +337,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - nShared += int(mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); - isFirstShared |= !iLayer && mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); + nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); + isFirstShared |= !iLayer && mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); } if (nShared > mTrkParams[0].ClusterSharing) { @@ -350,8 +350,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - mTimeFrame->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); - int currentROF = mTimeFrame->getClusterROF(iLayer, track.getClusterIndex(iLayer)); + mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); + int currentROF = mTimeFrameGPU->getClusterROF(iLayer, track.getClusterIndex(iLayer)); for (int iR{0}; iR < 3; ++iR) { if (rofs[iR] == INT_MAX) { rofs[iR] = currentROF; @@ -367,7 +367,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (rofs[1] != INT_MAX) { track.setNextROFbit(); } - mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); + mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } } mTimeFrameGPU->loadUsedClustersDevice(); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index c1e754b3c5b45..00c26a67bcb51 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -192,6 +192,17 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, 0.f, 0.f, 0.f, 0.f, sg2q2pt}); } +// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }; +// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }; + +struct sort_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); } +}; + +struct equal_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; } +}; + template struct pair_to_first : public thrust::unary_function, T1> { GPUhd() int operator()(const gpuPair& a) const @@ -686,10 +697,7 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, const int nTracklets) { for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { - auto& tracklet{tracklets[currentTrackletIndex]}; - if (tracklet.firstClusterIndex >= 0) { - atomicAdd(trackletsLookUpTable + tracklet.firstClusterIndex, 1); - } + atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); } } @@ -808,7 +816,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const unsigned char** usedClusters, const int** clustersIndexTables, Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, int** trackletsLUTs, + gsl::span trackletsLUTsHost, const int iteration, const float NSigmaCut, std::vector& phiCuts, @@ -848,8 +859,31 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaDeviceSynchronize()); + thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); + thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); + nTracklets[iLayer] = unique_end - tracklets_ptr; + LOGP(info, "=> {} {}", nTracklets[iLayer], unique_end - tracklets_ptr); + if (iLayer > 0) { + gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); + gpu::compileTrackletsLookupTableKernel<<>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + gpuCheckError(cudaFree(d_temp_storage)); + } } } @@ -1127,7 +1161,10 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const unsigned char** usedClusters, const int** clustersIndexTables, Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, int** trackletsLUTs, + gsl::span trackletsLUTsHost, const int iteration, const float NSigmaCut, std::vector& phiCuts, diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 510d819776147..cfeb2cbc73a8b 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -193,56 +193,17 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in } } } - // if (rof0 == 81) { - // printf("CPU layer: %d -> %f %f %f %f %f %f %f %f\n", - // iLayer, - // mTrkParams[iteration].NSigmaCut, - // tf->getPhiCut(iLayer), - // mTrkParams[iteration].PVres, - // tf->getMinR(iLayer + 1), - // tf->getMaxR(iLayer + 1), - // tf->getPositionResolution(iLayer), - // meanDeltaR, - // tf->getMSangle(iLayer)); - // } } } if (!tf->checkMemory(mTrkParams[iteration].MaxMemory)) { return; } - // for (auto iLayer{0}; iLayer < tf->getTracklets().size(); ++iLayer) { - // std::cout << "tracklets layer " << iLayer << ": " << tf->getTracklets()[iLayer].size() << std::endl; - // } - - // for (auto iLayer{0}; iLayer < tf->getTrackletsLookupTable().size(); ++iLayer) { - // auto lut = tf->getTrackletsLookupTable()[iLayer]; - // for (unsigned int iC{0}; iC < lut.size(); ++iC) { - // if (!(iC % 150)) { - // printf("\n row %d: ===> %d/%d\t", iLayer, iC, (int)lut.size()); - // } - // printf("%d\t", lut[iC]); - // } - // } - - // for (auto rofId{0}; rofId < 2304; ++rofId) { - // int nClus = tf->getClustersOnLayer(rofId, 1).size(); - // if (!nClus) { - // continue; - // } - // printf("rof: %d (%d) ==> ", rofId, nClus); - - // for (int iC{0}; iC < nClus; ++iC) { - // int nT = tf->getTrackletsLookupTable()[0][tf->getSortedIndex(rofId, 1, iC)]; - // printf("%d\t", nT); - // } - // printf("\n"); - // } - #pragma omp parallel for num_threads(mNThreads) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { /// Sort tracklets auto& trkl{tf->getTracklets()[iLayer + 1]}; + auto oldsize{trkl.size()}; std::sort(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }); @@ -265,6 +226,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in /// Compute LUT std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0); lut.push_back(trkl.size()); + LOGP(info, "CPU layer {} -> old size: {} - new size: {}", iLayer, oldsize, trkl.size()); } /// Layer 0 is done outside the loop std::sort(tf->getTracklets()[0].begin(), tf->getTracklets()[0].end(), [](const Tracklet& a, const Tracklet& b) {