From 51e7404c89aeb0b459324baac7c21122e10b432f Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 20 Nov 2024 18:20:31 +0100 Subject: [PATCH] Checkpointing --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 13 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 12 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 16 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 64 +++- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 16 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 295 ++++++++++-------- .../tracking/include/ITStracking/TimeFrame.h | 5 +- 7 files changed, 269 insertions(+), 152 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 6b5d32dc1c17a..8f2ba9dd6ea58 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -51,10 +51,14 @@ class TimeFrameGPU : public TimeFrame void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr); void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); + void loadIndexTableUtils(const int); void loadTrackingFrameInfoDevice(const int); void loadUnsortedClustersDevice(const int); void loadClustersDevice(const int); - void loadROframeClustersDevice(const int iteration); + void loadClustersIndexTables(const int iteration); + void createUsedClustersDevice(const int); + void loadUsedClustersDevice(); + void loadROframeClustersDevice(const int); void loadMultiplicityCutMask(const int); void loadVertices(const int); @@ -112,6 +116,8 @@ class TimeFrameGPU : public TimeFrame const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } + const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } + const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } @@ -148,7 +154,6 @@ class TimeFrameGPU : public TimeFrame // Device pointers StaticTrackingParameters* mTrackingParamsDevice; IndexTableUtils* mIndexTableUtilsDevice; - std::array mUsedClustersDevice; // Hybrid pref uint8_t* mMultMaskDevice; @@ -156,9 +161,13 @@ class TimeFrameGPU : public TimeFrame int* mROFramesPVDevice; std::array mClustersDevice; std::array mUnsortedClustersDevice; + std::array mClustersIndexTablesDevice; + std::array mUsedClustersDevice; std::array mROFramesClustersDevice; const Cluster** mClustersDeviceArray; const Cluster** mUnsortedClustersDeviceArray; + const int** mClustersIndexTablesDeviceArray; + const unsigned char** mUsedClustersDeviceArray; const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; const Tracklet** mTrackletsDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 5b70d571b9b52..0496635f8898b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -51,7 +51,8 @@ GPUg() void fitTrackSeedsKernel( } // namespace gpu template -void computeTrackletsInRofsHandler(const uint8_t* multMask, +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, const int startROF, const int endROF, const int maxROF, @@ -62,6 +63,15 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask, const int nVertices, const Cluster** clusters, const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minR, + std::vector& maxR, + std::vector& resolutions, std::vector& radii, std::vector& mulScatAng, const int nBlocks, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index cc45e24a8cbdb..a88e51742e84a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -39,9 +39,9 @@ struct gpuSpan { using ref = T&; GPUd() gpuSpan() : _data(nullptr), _size(0) {} - GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {} - GPUd() ref operator[](std::size_t idx) const { return _data[idx]; } - GPUd() std::size_t size() const { return _size; } + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } GPUd() bool empty() const { return _size == 0; } GPUd() ref front() const { return _data[0]; } GPUd() ref back() const { return _data[_size - 1]; } @@ -50,7 +50,7 @@ struct gpuSpan { protected: ptr _data; - std::size_t _size; + unsigned int _size; }; template @@ -60,10 +60,10 @@ struct gpuSpan { using ref = const T&; GPUd() gpuSpan() : _data(nullptr), _size(0) {} - GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {} + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} GPUd() gpuSpan(const gpuSpan& other) : _data(other._data), _size(other._size) {} - GPUd() ref operator[](std::size_t idx) const { return _data[idx]; } - GPUd() std::size_t size() const { return _size; } + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } GPUd() bool empty() const { return _size == 0; } GPUd() ref front() const { return _data[0]; } GPUd() ref back() const { return _data[_size - 1]; } @@ -72,7 +72,7 @@ struct gpuSpan { protected: ptr _data; - std::size_t _size; + unsigned int _size; }; enum class Task { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 0db970b2361ab..0b30d7af99246 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -92,6 +92,19 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl +void TimeFrameGPU::loadIndexTableUtils(const int iteration) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils"); + if (!iteration) { + LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB); + allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator()); + } + LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB); + checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) { @@ -128,13 +141,56 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) } } +template +void TimeFrameGPU::loadClustersIndexTables(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(info, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::createUsedClustersDevice(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::loadUsedClustersDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB); + checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadROframeClustersDevice(const int iteration) { if (!iteration) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); + LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } @@ -167,7 +223,7 @@ void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) { if (!iteration) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); - LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); + LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); @@ -179,10 +235,10 @@ void TimeFrameGPU::loadVertices(const int iteration) { if (!iteration) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); - LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); + LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); - LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); + LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 8fb3f628cacc0..b83caf5b5b849 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -31,10 +31,13 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers); mTimeFrameGPU->loadClustersDevice(iteration); mTimeFrameGPU->loadUnsortedClustersDevice(iteration); + mTimeFrameGPU->loadClustersIndexTables(iteration); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); mTimeFrameGPU->loadMultiplicityCutMask(iteration); mTimeFrameGPU->loadVertices(iteration); mTimeFrameGPU->loadROframeClustersDevice(iteration); + mTimeFrameGPU->createUsedClustersDevice(iteration); + mTimeFrameGPU->loadIndexTableUtils(iteration); } template @@ -95,7 +98,8 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; - computeTrackletsInRofsHandler(mTimeFrameGPU->getDeviceMultCutMask(), + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), startROF, endROF, mTimeFrameGPU->getNrof(), @@ -106,6 +110,15 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int mTimeFrameGPU->getPrimaryVerticesNum(), mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), conf.nBlocks, @@ -324,6 +337,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } } + mTimeFrameGPU->loadUsedClustersDevice(); if (iteration == mTrkParams.size() - 1) { mTimeFrameGPU->unregisterHostMemory(0); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index cacfa70ec2993..110595e2821a9 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -74,9 +74,36 @@ namespace o2::its { using namespace constants::its2; using Vertex = o2::dataformats::Vertex>; + +GPUd() float Sq(float v) +{ + return v * v; +} + namespace gpu { +GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, + const o2::its::IndexTableUtils& utils, + const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +{ + const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; + const float phiRangeMin = (maxdeltaphi > constants::math::Pi) ? 0.f : currentCluster.phi - maxdeltaphi; + const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; + const float phiRangeMax = (maxdeltaphi > constants::math::Pi) ? constants::math::TwoPi : currentCluster.phi + maxdeltaphi; + + if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || + zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { + + return getEmptyBinsRect(); + } + + return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), + o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; +} + GPUd() bool fitTrack(TrackITSExt& track, int start, int end, @@ -199,30 +226,28 @@ struct is_valid_pair { GPUd() gpuSpan getPrimaryVertices(const int rof, const int* roframesPV, - const int nRof, + const int nROF, const uint8_t* mask, const Vertex* vertices) { const int start_pv_id = roframesPV[rof]; - const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1; - size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if Rof is excluded + const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1; + size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded return gpuSpan(&vertices[start_pv_id], delta); }; GPUd() gpuSpan getClustersOnLayer(const int rof, - const int** roframesClus, + const int totROFs, const int layer, - const int nRof, + const int** roframesClus, const Cluster** clusters) { - const int start_clus_id{roframesClus[layer][rof]}; - const int stop_rof = rof >= nRof - 1 ? nRof : rof + 1; - const int delta = roframesClus[layer][stop_rof] - start_clus_id; - printf("\t\t\t r: %d nr: %d rfci: %d d: %d\n", rof, nRof, roframesClus[layer][rof], delta); - if (rof < 0 || rof >= nRof) { + if (rof < 0 || rof >= totROFs) { return gpuSpan(); } - + const int start_clus_id{roframesClus[layer][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id; return gpuSpan(&(clusters[layer][start_clus_id]), delta); } @@ -424,45 +449,45 @@ GPUg() void computeLayerCellsKernel( } } -template +template GPUg() void computeLayerTrackletsMultiROFKernel( + const IndexTableUtils* utils, const uint8_t* multMask, const int layerIndex, const int startROF, const int endROF, - const int maxRof, - const int deltaRof, + const int totalROFs, + const int deltaROF, const Vertex* vertices, const int* rofPV, const int nVertices, const int vertexId, - const Cluster** clusters, // input data rof0 - const int** ROFClusters, // Number of clusters on layers per ROF - // const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - // const int* indexTablesNext, // input data rof0-delta getNphiBins()}; - // const int zBins{utils->getNzBins()}; - for (unsigned int iRof{blockIdx.x}; iRof < endROF - startROF; iRof += gridDim.x) { - auto rof0 = iRof + startROF; - auto primaryVertices = getPrimaryVertices(rof0, rofPV, maxRof, multMask, vertices); + const int phiBins{utils->getNphiBins()}; + const int zBins{utils->getNzBins()}; + for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) { + const int rof0 = iROF + startROF; + auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices); const auto startVtx{vertexId >= 0 ? vertexId : 0}; const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; - auto minRof = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaRof)); - auto maxRof = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaRof)); - auto clustersCurrentLayer = getClustersOnLayer(iRof, ROFClusters, layerIndex, maxRof, clusters); - if (threadIdx.x == 0) { - // printf("> l: %d r: %d rc: %d s: %d e: %d \n", layerIndex, iRof, ROFClusters[layerIndex][iRof], clustersCurrentLayer.size(), clustersCurrentLayer.empty()); - } + auto minROF = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaROF)); + auto maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaROF)); + auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters); if (clustersCurrentLayer.empty()) { continue; } @@ -470,79 +495,72 @@ GPUg() void computeLayerTrackletsMultiROFKernel( for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) { unsigned int storedTracklets{0}; auto currentCluster{clustersCurrentLayer[currentClusterIndex]}; - if (threadIdx.x == 0) { - printf("rof: %d has %zu clusters on layer %d\n", rof0, clustersCurrentLayer.size(), layerIndex); + const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex}; + if (usedClusters[layerIndex][currentSortedIndex]) { + continue; } - // const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; - // const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startROF]}; - // if (usedClustersLayer[currentSortedIndex]) { - // continue; - // } - // - // int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - // int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} - // const float inverseR0{1.f / currentCluster.radius}; - // - // for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { - // const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; - // const float resolution{Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - // const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - // const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - // const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - // const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - // const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - // - // const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - // - // if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { - // continue; - // } - // int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - // if (phiBinsNum < 0) { - // phiBinsNum += trkPars->PhiBins; - // } - // const int tableSize{phiBins * zBins + 1}; - // for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { - // auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; - // if (!nClustersNext) { // number of clusters on next layer > 0 - // continue; - // } - // for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - // int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - // const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - // const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - // const int firstRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + firstBinIndex]; - // const int maxRowClusterIndex = indexTablesNext[(rof1 - startROF) * tableSize + maxBinIndex]; - // for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - // if (iNextCluster >= nClustersNext) { - // break; - // } - // auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startROF] + iNextCluster}; - // const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; - // if (usedClustersNextLayer[nextCluster.clusterId]) { - // continue; - // } - // const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; - // const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - // - // if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { - // const float phi{ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - // const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - // const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - // if (storedTracklets < maxTrackletsPerCluster) { - // new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; - // } - // else { - // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); - // } - // ++storedTracklets; - // } - // } - // } - // } - // } - // */ + const float inverseR0{1.f / currentCluster.radius}; + for (int iV{startVtx}; iV < endVtx; ++iV) { + auto& primaryVertex{primaryVertices[iV]}; + if (primaryVertex.isFlagSet(2) && iteration != 3) { + continue; + } + const float resolution = o2::gpu::CAMath::Sqrt(Sq(resolutionPV) / primaryVertex.getNContributors() + Sq(positionResolution)); + const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; + const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; + const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; + const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution + const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * MSAngle))}; + const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; + if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + continue; + } + int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; + + if (phiBinsNum < 0) { + phiBinsNum += phiBins; + } + + const int tableSize{phiBins * zBins + 1}; + for (int rof1{minROF}; rof1 <= maxROF; ++rof1) { + auto clustersNextLayer = getClustersOnLayer(rof1, totalROFs, layerIndex + 1, ROFClusters, clusters); + if (clustersNextLayer.empty()) { + continue; + } + for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { + int iPhiBin = (selectedBinsRect.y + iPhiCount) % PhiBins; + const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; + const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; + const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex]; + const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex]; + for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { + if (iNextCluster >= clustersNextLayer.size()) { + break; + } + const Cluster& nextCluster{clustersNextLayer[iNextCluster]}; + if (usedClusters[layerIndex + 1][nextCluster.clusterId]) { + continue; + } + const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; + const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + + currentCluster.zCoordinate - nextCluster.zCoordinate)}; + if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { + // if (layerIndex > 0) { + if constexpr (initRun) { + // trackletsLUT[currentSortedIndex]++; // we need l0 as well for usual exclusive sums. + } else { + // } + const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; + const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; + // tf->getTracklets()[layerIndex].emplace_back(currentSortedIndex, tf->getSortedIndex(rof1, layerIndex + 1, iNextCluster), tanL, phi, rof0, rof1); + } + ++storedTracklets; + } + } + } + } + } } } } @@ -550,26 +568,6 @@ GPUg() void computeLayerTrackletsMultiROFKernel( ///////////////////////////////////////// // Debug Kernels ///////////////////////////////////////// -GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const o2::its::IndexTableUtils& utils, - const float z1, const float z2, float maxdeltaz, float maxdeltaphi) -{ - const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; - const float phiRangeMin = currentCluster.phi - maxdeltaphi; - const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; - const float phiRangeMax = currentCluster.phi + maxdeltaphi; - - if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || - zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { - - return getEmptyBinsRect(); - } - - return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), - o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; -} template GPUd() void pPointer(T* ptr) @@ -697,7 +695,8 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } // namespace gpu template -void computeTrackletsInRofsHandler(const uint8_t* multMask, +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, const int startROF, const int endROF, const int maxROF, @@ -708,16 +707,23 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask, const int nVertices, const Cluster** clusters, const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, std::vector& radii, std::vector& mulScatAng, const int nBlocks, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - const auto meanDeltaR = radii[iLayer + 1] - radii[iLayer]; - const auto mSAngle = mulScatAng[iLayer]; - // gpu::printMatrixRow<<<1, 1>>>(iLayer, ROFClusters, maxROF); - gpu::computeLayerTrackletsMultiROFKernel<<<1, 1>>>( + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, multMask, iLayer, startROF, @@ -730,8 +736,17 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask, vertexId, clusters, ROFClusters, - meanDeltaR, - mSAngle); + usedClusters, + clustersIndexTables, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); } } @@ -965,7 +980,8 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaDeviceSynchronize()); } -template void computeTrackletsInRofsHandler<7>(const uint8_t* multMask, +template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, + const uint8_t* multMask, const int startROF, const int endROF, const int maxROF, @@ -976,6 +992,15 @@ template void computeTrackletsInRofsHandler<7>(const uint8_t* multMask, const int nVertices, const Cluster** clusters, const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, std::vector& radii, std::vector& mulScatAng, const int nBlocks, diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 309ca2031a9b5..fa4f33782d16a 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -106,13 +106,16 @@ class TimeFrame float getBeamX() const; float getBeamY() const; - + std::vector& getMinRs() { return mMinR; } + std::vector& getMaxRs() { return mMaxR; } float getMinR(int layer) const { return mMinR[layer]; } float getMaxR(int layer) const { return mMaxR[layer]; } float getMSangle(int layer) const { return mMSangles[layer]; } std::vector& getMSangles() { return mMSangles; } float getPhiCut(int layer) const { return mPhiCuts[layer]; } + std::vector& getPhiCuts() { return mPhiCuts; } float getPositionResolution(int layer) const { return mPositionResolution[layer]; } + std::vector& getPositionResolutions() { return mPositionResolution; } gsl::span getClustersOnLayer(int rofId, int layerId); gsl::span getClustersOnLayer(int rofId, int layerId) const;