diff --git a/Common/MathUtils/include/MathUtils/SMatrixGPU.h b/Common/MathUtils/include/MathUtils/SMatrixGPU.h index ef76c490ddfbd..60965a4fa2776 100644 --- a/Common/MathUtils/include/MathUtils/SMatrixGPU.h +++ b/Common/MathUtils/include/MathUtils/SMatrixGPU.h @@ -446,6 +446,8 @@ class SMatrixGPU GPUdi() SMatrixGPU(SMatrixNoInit) {} GPUd() SMatrixGPU(SMatrixIdentity); GPUd() SMatrixGPU(const SMatrixGPU& rhs); + template + GPUd() SMatrixGPU(const SMatrixGPU& rhs); template GPUd() SMatrixGPU(const Expr& rhs); template @@ -497,6 +499,11 @@ class SMatrixGPU GPUd() SMatrixRowGPU operator[](unsigned int i) { return SMatrixRowGPU(*this, i); } template GPUd() SMatrixGPU& operator+=(const SMatrixGPU& rhs); + GPUd() SMatrixGPU& operator*=(const T& rhs); + template + GPUd() SMatrixGPU& operator*=(const SMatrixGPU& rhs); + template + GPUd() SMatrixGPU& operator*=(const Expr& rhs); GPUd() bool Invert(); GPUd() bool IsInUse(const T* p) const; @@ -528,6 +535,13 @@ GPUdi() SMatrixGPU::SMatrixGPU(const SMatrixGPU& rhs mRep = rhs.mRep; } +template +template +GPUd() SMatrixGPU::SMatrixGPU(const SMatrixGPU& rhs) +{ + operator=(rhs); +} + template GPUdi() T* SMatrixGPU::begin() { @@ -1387,6 +1401,29 @@ GPUdi() SMatrixGPU& SMatrixGPU::operator+=(const SMa return *this; } +template +GPUdi() SMatrixGPU& SMatrixGPU::operator*=(const T & rhs) +{ + for (unsigned int i = 0; i < R::kSize; ++i) { + mRep.Array()[i] *= rhs; + } + return *this; +} + +template +template +GPUdi() SMatrixGPU& SMatrixGPU::operator*=(const SMatrixGPU& rhs) +{ + return operator=(*this* rhs); +} + +template +template +GPUdi() SMatrixGPU& SMatrixGPU::operator*=(const Expr& rhs) +{ + return operator=(*this* rhs); +} + template struct TranspPolicyGPU { enum { diff --git a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h index 536bacf1a6a70..015b5d37e258c 100644 --- a/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h +++ b/DataFormats/Reconstruction/include/ReconstructionDataFormats/TrackParametrizationWithError.h @@ -18,6 +18,7 @@ #define INCLUDE_RECONSTRUCTIONDATAFORMATS_TRACKPARAMETRIZATIONWITHERROR_H_ #include "ReconstructionDataFormats/TrackParametrization.h" +#include namespace o2 { @@ -38,8 +39,8 @@ class TrackParametrizationWithError : public TrackParametrization #endif using covMat_t = gpu::gpustd::array; - using MatrixDSym5 = ROOT::Math::SMatrix>; - using MatrixD5 = ROOT::Math::SMatrix>; + using MatrixDSym5 = o2::math_utils::SMatrix>; + using MatrixD5 = o2::math_utils::SMatrix>; GPUd() TrackParametrizationWithError(); GPUd() TrackParametrizationWithError(value_t x, value_t alpha, const params_t& par, const covMat_t& cov, int charge = 1, const PID pid = PID::Pion); @@ -100,12 +101,12 @@ class TrackParametrizationWithError : public TrackParametrization template GPUd() value_t getPredictedChi2(const BaseCluster& p) const; - void buildCombinedCovMatrix(const TrackParametrizationWithError& rhs, MatrixDSym5& cov) const; - value_t getPredictedChi2(const TrackParametrizationWithError& rhs, MatrixDSym5& covToSet) const; + GPUd() void buildCombinedCovMatrix(const TrackParametrizationWithError& rhs, MatrixDSym5& cov) const; + GPUd() value_t getPredictedChi2(const TrackParametrizationWithError& rhs, MatrixDSym5& covToSet) const; GPUd() value_t getPredictedChi2(const TrackParametrizationWithError& rhs) const; GPUd() value_t getPredictedChi2Quiet(const TrackParametrizationWithError& rhs) const; - bool update(const TrackParametrizationWithError& rhs, const MatrixDSym5& covInv); - bool update(const TrackParametrizationWithError& rhs); + GPUd() bool update(const TrackParametrizationWithError& rhs, const MatrixDSym5& covInv); + GPUd() bool update(const TrackParametrizationWithError& rhs); GPUd() bool update(const dim2_t& p, const dim3_t& cov); GPUd() bool update(const value_t* p, const value_t* cov); diff --git a/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx b/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx index e56830deace14..9f8a93a01e053 100644 --- a/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx +++ b/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx @@ -13,6 +13,7 @@ #include "ReconstructionDataFormats/Vertex.h" #include "ReconstructionDataFormats/DCA.h" #include +#include #ifndef GPUCA_GPUCODE_DEVICE #include @@ -754,30 +755,6 @@ GPUd() auto TrackParametrizationWithError::getPredictedChi2Quiet(const return (d * (szz * d - sdz * z) + z * (sdd * z - d * sdz)) / det; } -#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE) // Disable function relying on ROOT SMatrix on GPU - -//______________________________________________ -template -void TrackParametrizationWithError::buildCombinedCovMatrix(const TrackParametrizationWithError& rhs, MatrixDSym5& cov) const -{ - // fill combined cov.matrix (NOT inverted) - cov(kY, kY) = static_cast(getSigmaY2()) + static_cast(rhs.getSigmaY2()); - cov(kZ, kY) = static_cast(getSigmaZY()) + static_cast(rhs.getSigmaZY()); - cov(kZ, kZ) = static_cast(getSigmaZ2()) + static_cast(rhs.getSigmaZ2()); - cov(kSnp, kY) = static_cast(getSigmaSnpY()) + static_cast(rhs.getSigmaSnpY()); - cov(kSnp, kZ) = static_cast(getSigmaSnpZ()) + static_cast(rhs.getSigmaSnpZ()); - cov(kSnp, kSnp) = static_cast(getSigmaSnp2()) + static_cast(rhs.getSigmaSnp2()); - cov(kTgl, kY) = static_cast(getSigmaTglY()) + static_cast(rhs.getSigmaTglY()); - cov(kTgl, kZ) = static_cast(getSigmaTglZ()) + static_cast(rhs.getSigmaTglZ()); - cov(kTgl, kSnp) = static_cast(getSigmaTglSnp()) + static_cast(rhs.getSigmaTglSnp()); - cov(kTgl, kTgl) = static_cast(getSigmaTgl2()) + static_cast(rhs.getSigmaTgl2()); - cov(kQ2Pt, kY) = static_cast(getSigma1PtY()) + static_cast(rhs.getSigma1PtY()); - cov(kQ2Pt, kZ) = static_cast(getSigma1PtZ()) + static_cast(rhs.getSigma1PtZ()); - cov(kQ2Pt, kSnp) = static_cast(getSigma1PtSnp()) + static_cast(rhs.getSigma1PtSnp()); - cov(kQ2Pt, kTgl) = static_cast(getSigma1PtTgl()) + static_cast(rhs.getSigma1PtTgl()); - cov(kQ2Pt, kQ2Pt) = static_cast(getSigma1Pt2()) + static_cast(rhs.getSigma1Pt2()); -} - //______________________________________________ template GPUd() auto TrackParametrizationWithError::getPredictedChi2(const TrackParametrizationWithError& rhs) const -> value_t @@ -819,6 +796,28 @@ GPUd() auto TrackParametrizationWithError::getPredictedChi2(const Track return chi2diag + 2. * chi2ndiag; } +//______________________________________________ +template +GPUd() void TrackParametrizationWithError::buildCombinedCovMatrix(const TrackParametrizationWithError& rhs, MatrixDSym5& cov) const +{ + // fill combined cov.matrix (NOT inverted) + cov(kY, kY) = static_cast(getSigmaY2()) + static_cast(rhs.getSigmaY2()); + cov(kZ, kY) = static_cast(getSigmaZY()) + static_cast(rhs.getSigmaZY()); + cov(kZ, kZ) = static_cast(getSigmaZ2()) + static_cast(rhs.getSigmaZ2()); + cov(kSnp, kY) = static_cast(getSigmaSnpY()) + static_cast(rhs.getSigmaSnpY()); + cov(kSnp, kZ) = static_cast(getSigmaSnpZ()) + static_cast(rhs.getSigmaSnpZ()); + cov(kSnp, kSnp) = static_cast(getSigmaSnp2()) + static_cast(rhs.getSigmaSnp2()); + cov(kTgl, kY) = static_cast(getSigmaTglY()) + static_cast(rhs.getSigmaTglY()); + cov(kTgl, kZ) = static_cast(getSigmaTglZ()) + static_cast(rhs.getSigmaTglZ()); + cov(kTgl, kSnp) = static_cast(getSigmaTglSnp()) + static_cast(rhs.getSigmaTglSnp()); + cov(kTgl, kTgl) = static_cast(getSigmaTgl2()) + static_cast(rhs.getSigmaTgl2()); + cov(kQ2Pt, kY) = static_cast(getSigma1PtY()) + static_cast(rhs.getSigma1PtY()); + cov(kQ2Pt, kZ) = static_cast(getSigma1PtZ()) + static_cast(rhs.getSigma1PtZ()); + cov(kQ2Pt, kSnp) = static_cast(getSigma1PtSnp()) + static_cast(rhs.getSigma1PtSnp()); + cov(kQ2Pt, kTgl) = static_cast(getSigma1PtTgl()) + static_cast(rhs.getSigma1PtTgl()); + cov(kQ2Pt, kQ2Pt) = static_cast(getSigma1Pt2()) + static_cast(rhs.getSigma1Pt2()); +} + //______________________________________________ template GPUd() bool TrackParametrizationWithError::update(const TrackParametrizationWithError& rhs, const MatrixDSym5& covInv) @@ -867,7 +866,7 @@ GPUd() bool TrackParametrizationWithError::update(const TrackParametriz } // updated covariance: Cov0 = Cov0 - K*Cov0 - matK *= ROOT::Math::SMatrix>(matC0); + matK *= o2::math_utils::SMatrix>(matC0); mC[kSigY2] -= matK(kY, kY); mC[kSigZY] -= matK(kZ, kY); mC[kSigZ2] -= matK(kZ, kZ); @@ -901,8 +900,6 @@ GPUd() bool TrackParametrizationWithError::update(const TrackParametriz return update(rhs, covI); } -#endif - //______________________________________________ template GPUd() bool TrackParametrizationWithError::update(const value_t* p, const value_t* cov) @@ -1245,6 +1242,7 @@ GPUd() void TrackParametrizationWithError::printHexadecimal() namespace o2::track { + #if !defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_DEVICE) // FIXME: DR: WORKAROUND to avoid CUDA bug creating host symbols for device code. template class TrackParametrizationWithError; #endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index b0f9e8ac55c6f..9b39416042f7b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -26,12 +26,9 @@ namespace o2 { -// namespace gpu -// { -// // class GPUChainITS; -// } namespace its -{namespace gpu +{ +namespace gpu { class DefaultGPUAllocator : public ExternalAllocator @@ -59,13 +56,17 @@ class TimeFrameGPU : public TimeFrame void loadClustersDevice(); void loadTrackletsDevice(); void loadCellsDevice(); + void loadCellsLUT(); void loadTrackSeedsDevice(); void loadTrackSeedsChi2Device(); void loadRoadsDevice(); void loadTrackSeedsDevice(std::vector&); - void createCellNeighboursDevice(const unsigned int& layer, std::vector>& neighbours); + void createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours); + void createNeighboursLUTDevice(const int, const unsigned int); void createTrackITSExtDevice(std::vector&); void downloadTrackITSExtDevice(std::vector&); + void downloadCellsNeighbours(std::vector>>&, const int); + void downloadNeighboursLUT(std::vector&, const int); void initDeviceChunks(const int, const int); template size_t loadChunkData(const size_t, const size_t, const size_t); @@ -92,6 +93,7 @@ class TimeFrameGPU : public TimeFrame // Hybrid Road* getDeviceRoads() { return mRoadsDevice; } TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; } + int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; } gpuPair* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } TrackingFrameInfo* getDeviceTrackingFrameInfo(const int); // TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() { return mTrackingFrameInfoDeviceArray; } @@ -99,10 +101,14 @@ class TimeFrameGPU : public TimeFrame Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } + int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } + int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; } CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; } o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } + int* getNeighboursIndexTablesDevice(const int layer) { return mNeighboursIndexTablesDevice[layer]; } + void setDevicePropagator(const o2::base::PropagatorImpl*) override; // Host-specific getters @@ -131,9 +137,13 @@ class TimeFrameGPU : public TimeFrame Cluster** mUnsortedClustersDeviceArray; std::array mTrackletsDevice; Tracklet** mTrackletsDeviceArray; - std::array mCellsLookupTablesDevice; - int** mCellsLookupTablesDeviceArray; + std::array mCellsLUTDevice; + std::array mNeighboursLUTDevice; + int** mCellsLUTDeviceArray; + int** mNeighboursCellDeviceArray; + int** mNeighboursCellLUTDeviceArray; std::array mCellsDevice; + std::array mNeighboursIndexTablesDevice; CellSeed* mTrackSeedsDevice; CellSeed** mCellsDeviceArray; std::array mCellSeedsDevice; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index cc74456bbb1aa..c8b8d3080eea4 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -49,6 +49,33 @@ GPUg() void fitTrackSeedsKernel( const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT); #endif } // namespace gpu +void countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUTs, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads); + +void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUTs, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads); void trackSeedHandler(CellSeed* trackSeeds, const TrackingFrameInfo** foundTrackingFrameInfo, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 062009954a5e8..27630a4deeeef 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ if(CUDA_ENABLED) find_package(CUDAToolkit) message(STATUS "Building ITS CUDA tracker") - +add_compile_options(-O0 -g -fPIC) o2_add_library(ITStrackingCUDA SOURCES ClusterLinesGPU.cu Context.cu @@ -32,7 +32,7 @@ o2_add_library(ITStrackingCUDA O2::SimulationDataFormat O2::ReconstructionDataFormats O2::GPUCommon - CUDA::nvToolsExt # TODO: change to CUDA::nvtx3 when CMake bump >= 3.25 + CUDA::nvToolsExt PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider TARGETVARNAME targetName) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 8f61ffaf1bea3..50d8be7d7f717 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -45,13 +45,10 @@ using utils::checkGPUError; void* DefaultGPUAllocator::allocate(size_t size) { - LOGP(info, "Called DefaultGPUAllocator::allocate with size {}", size); + LOGP(fatal, "Called DefaultGPUAllocator::allocate with size {}", size); return nullptr; // to be implemented } -///////////////////////////////////////////////////////////////////////////////////////// -// TimeFrameGPU -///////////////////////////////////////////////////////////////////////////////////////// template TimeFrameGPU::TimeFrameGPU() { @@ -79,105 +76,6 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl -void TimeFrameGPU::registerHostMemory(const int maxLayers) -{ - if (mHostRegistered) { - return; - } else { - mHostRegistered = true; - } - for (auto iLayer{0}; iLayer < maxLayers; ++iLayer) { - checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - checkGPUError(cudaHostRegister(mNClustersPerROF[iLayer].data(), mNClustersPerROF[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaHostRegister(mIndexTables[iLayer].data(), (mStaticTrackingParams.ZBins * mStaticTrackingParams.PhiBins + 1) * mNrof * sizeof(int), cudaHostRegisterPortable)); - } - checkGPUError(cudaHostRegister(mHostNTracklets.data(), (nLayers - 1) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaHostRegister(mHostNCells.data(), (nLayers - 2) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable)); -} - -template -void TimeFrameGPU::unregisterHostMemory(const int maxLayers) -{ - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - checkGPUError(cudaHostUnregister(mTrackingFrameInfo[iLayer].data())); - } - checkGPUError(cudaHostUnregister(mTrackingFrameInfoDevice.data())); -} - -template -void TimeFrameGPU::initialise(const int iteration, - const TrackingParameters& trkParam, - const int maxLayers, - IndexTableUtils* utils, - const TimeFrameGPUParameters* gpuParam) -{ - mGpuStreams.resize(mGpuParams.nTimeFrameChunks); - o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); -} - -template -void TimeFrameGPU::wipe(const int maxLayers) -{ - unregisterHostMemory(maxLayers); -} - -template -void TimeFrameGPU::initDevice(IndexTableUtils* utils, - const TrackingParameters& trkParam, - const TimeFrameGPUParameters& gpuParam, - const int maxLayers, - const int iteration) -{ - // mStaticTrackingParams.ZBins = trkParam.ZBins; - // mStaticTrackingParams.PhiBins = trkParam.PhiBins; - // if (mFirstInit) { - // mGpuParams = gpuParam; - // allocMemAsync(reinterpret_cast(&mTrackingParamsDevice), sizeof(gpu::StaticTrackingParameters), nullptr, true); - // checkGPUError(cudaMemcpy(mTrackingParamsDevice, &mStaticTrackingParams, sizeof(gpu::StaticTrackingParameters), cudaMemcpyHostToDevice)); - // if (utils) { // If utils is not nullptr, then its gpu vertexing - // mIndexTableUtils = *utils; - // allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, true); - // } else { // GPU tracking otherwise - // mIndexTableUtils.setTrackingParameters(trkParam); - // } - - // mMemChunks.resize(mGpuParams.nTimeFrameChunks, GpuTimeFrameChunk{static_cast(this), mGpuParams}); - // mVerticesInChunks.resize(mGpuParams.nTimeFrameChunks); - // mNVerticesInChunks.resize(mGpuParams.nTimeFrameChunks); - // mLabelsInChunks.resize(mGpuParams.nTimeFrameChunks); - // LOGP(info, "Size of fixed part is: {} MB", GpuTimeFrameChunk::computeFixedSizeBytes(mGpuParams) / MB); - // LOGP(info, "Size of scaling part is: {} MB", GpuTimeFrameChunk::computeScalingSizeBytes(GpuTimeFrameChunk::computeRofPerChunk(mGpuParams, mAvailMemGB), mGpuParams) / MB); - // LOGP(info, "Allocating {} chunks of {} rofs capacity each.", mGpuParams.nTimeFrameChunks, mGpuParams.nROFsPerChunk); - - // for (int iChunk{0}; iChunk < mMemChunks.size(); ++iChunk) { - // mMemChunks[iChunk].allocate(GpuTimeFrameChunk::computeRofPerChunk(mGpuParams, mGpuParams.maxGPUMemoryGB), mGpuStreams[iChunk]); - // } - // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - // allocMemAsync(reinterpret_cast(&mROframesClustersDevice[iLayer]), mROframesClusters[iLayer].size() * sizeof(int), nullptr, true); - // allocMemAsync(reinterpret_cast(&(mUsedClustersDevice[iLayer])), sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof, nullptr, true); - // } - // allocMemAsync(reinterpret_cast(&mVerticesDevice), sizeof(Vertex) * mGpuParams.maxVerticesCapacity, nullptr, true); - // allocMemAsync(reinterpret_cast(&mROframesPVDevice), sizeof(int) * (mNrof + 1), nullptr, true); - - // mFirstInit = false; - // } - // if (maxLayers < nLayers) { // Vertexer - // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - // checkGPUError(cudaMemcpy(mROframesClustersDevice[iLayer], mROframesClusters[iLayer].data(), mROframesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); - // } - // } else { // Tracker - // checkGPUError(cudaMemcpy(mVerticesDevice, mPrimaryVertices.data(), sizeof(Vertex) * mPrimaryVertices.size(), cudaMemcpyHostToDevice)); - // checkGPUError(cudaMemcpy(mROframesPVDevice, mROframesPV.data(), sizeof(int) * mROframesPV.size(), cudaMemcpyHostToDevice)); - // if (!iteration) { - // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - // checkGPUError(cudaMemset(mUsedClustersDevice[iLayer], 0, sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof)); - // } - // } - // } - // checkGPUError(cudaMemcpy(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice)); -} - template void TimeFrameGPU::loadUnsortedClustersDevice() { @@ -246,6 +144,7 @@ void TimeFrameGPU::loadCellsDevice() for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(info, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", mCells[iLayer].size(), iLayer, mCells[iLayer].size() * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), mCells[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding. // Register and move data checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); @@ -255,6 +154,21 @@ void TimeFrameGPU::loadCellsDevice() checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } +template +void TimeFrameGPU::loadCellsLUT() +{ + for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { + LOGP(info, "gpu-transfer: loading {} cell LUTs on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&(mCellsLUTDevice[iLayer])), sizeof(int) * mCellsLookupTable[iLayer].size(), nullptr, getExtAllocator()); + // Register and move data + checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); +} + template void TimeFrameGPU::loadRoadsDevice() { @@ -274,14 +188,21 @@ void TimeFrameGPU::loadTrackSeedsDevice(std::vector& seeds) } template -void TimeFrameGPU::createCellNeighboursDevice(const unsigned int& layer, std::vector>& neighbours) +void TimeFrameGPU::createNeighboursDevice(const unsigned int& layer, std::vector>& neighbours) { mCellsNeighbours[layer].clear(); mCellsNeighbours[layer].resize(neighbours.capacity()); LOGP(info, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.capacity(), neighbours.capacity() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.capacity() * sizeof(gpuPair), &(mGpuStreams[0]), getExtAllocator()); - checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], 0, neighbours.capacity() * sizeof(gpuPair), mGpuStreams[0].get())); - checkGPUError(cudaHostRegister(neighbours.data(), neighbours.capacity() * sizeof(std::pair), cudaHostRegisterPortable)); + checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.capacity() * sizeof(gpuPair), mGpuStreams[0].get())); +} + +template +void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) +{ + LOGP(debug, "gpu-allocation: reserving {} slots for neighbours LUT, for {} MB.", nCells, nCells * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), nCells * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc + checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, nCells * sizeof(int), mGpuStreams[0].get())); } template @@ -295,6 +216,23 @@ void TimeFrameGPU::createTrackITSExtDevice(std::vector& seeds checkGPUError(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); } +template +void TimeFrameGPU::downloadCellsNeighbours(std::vector>>& neighbours, const int layer) +{ + LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair) / MB); + // TOOD: something less dangerous than assuming the same memory layout of std::pair and gpuPair :) + checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + // checkGPUError(cudaHostUnregister(neighbours[layer].data())); + // discardResult(cudaDeviceSynchronize()); +} + +template +void TimeFrameGPU::downloadNeighboursLUT(std::vector& lut, const int layer) +{ + LOGP(info, "gpu-transfer: downloading {} neighbours lut, for {} MB.", lut.size(), lut.size() * sizeof(int) / MB); + checkGPUError(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); +} + template void TimeFrameGPU::downloadTrackITSExtDevice(std::vector& seeds) { @@ -302,7 +240,107 @@ void TimeFrameGPU::downloadTrackITSExtDevice(std::vector& see checkGPUError(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); checkGPUError(cudaHostUnregister(mTrackITSExt.data())); checkGPUError(cudaHostUnregister(seeds.data())); - discardResult(cudaDeviceSynchronize()); + // discardResult(cudaDeviceSynchronize()); +} + +/// Legacy +template +void TimeFrameGPU::registerHostMemory(const int maxLayers) +{ + if (mHostRegistered) { + return; + } else { + mHostRegistered = true; + } + for (auto iLayer{0}; iLayer < maxLayers; ++iLayer) { + checkGPUError(cudaHostRegister(mClusters[iLayer].data(), mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + checkGPUError(cudaHostRegister(mNClustersPerROF[iLayer].data(), mNClustersPerROF[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaHostRegister(mIndexTables[iLayer].data(), (mStaticTrackingParams.ZBins * mStaticTrackingParams.PhiBins + 1) * mNrof * sizeof(int), cudaHostRegisterPortable)); + } + checkGPUError(cudaHostRegister(mHostNTracklets.data(), (nLayers - 1) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaHostRegister(mHostNCells.data(), (nLayers - 2) * mGpuParams.nTimeFrameChunks * sizeof(int), cudaHostRegisterPortable)); +} + +template +void TimeFrameGPU::unregisterHostMemory(const int maxLayers) +{ + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + checkGPUError(cudaHostUnregister(mTrackingFrameInfo[iLayer].data())); + } + checkGPUError(cudaHostUnregister(mTrackingFrameInfoDevice.data())); +} + +template +void TimeFrameGPU::initialise(const int iteration, + const TrackingParameters& trkParam, + const int maxLayers, + IndexTableUtils* utils, + const TimeFrameGPUParameters* gpuParam) +{ + mGpuStreams.resize(mGpuParams.nTimeFrameChunks); + o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); +} + +template +void TimeFrameGPU::wipe(const int maxLayers) +{ + unregisterHostMemory(maxLayers); +} + +template +void TimeFrameGPU::initDevice(IndexTableUtils* utils, + const TrackingParameters& trkParam, + const TimeFrameGPUParameters& gpuParam, + const int maxLayers, + const int iteration) +{ + // mStaticTrackingParams.ZBins = trkParam.ZBins; + // mStaticTrackingParams.PhiBins = trkParam.PhiBins; + // if (mFirstInit) { + // mGpuParams = gpuParam; + // allocMemAsync(reinterpret_cast(&mTrackingParamsDevice), sizeof(gpu::StaticTrackingParameters), nullptr, true); + // checkGPUError(cudaMemcpy(mTrackingParamsDevice, &mStaticTrackingParams, sizeof(gpu::StaticTrackingParameters), cudaMemcpyHostToDevice)); + // if (utils) { // If utils is not nullptr, then its gpu vertexing + // mIndexTableUtils = *utils; + // allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, true); + // } else { // GPU tracking otherwise + // mIndexTableUtils.setTrackingParameters(trkParam); + // } + + // mMemChunks.resize(mGpuParams.nTimeFrameChunks, GpuTimeFrameChunk{static_cast(this), mGpuParams}); + // mVerticesInChunks.resize(mGpuParams.nTimeFrameChunks); + // mNVerticesInChunks.resize(mGpuParams.nTimeFrameChunks); + // mLabelsInChunks.resize(mGpuParams.nTimeFrameChunks); + // LOGP(info, "Size of fixed part is: {} MB", GpuTimeFrameChunk::computeFixedSizeBytes(mGpuParams) / MB); + // LOGP(info, "Size of scaling part is: {} MB", GpuTimeFrameChunk::computeScalingSizeBytes(GpuTimeFrameChunk::computeRofPerChunk(mGpuParams, mAvailMemGB), mGpuParams) / MB); + // LOGP(info, "Allocating {} chunks of {} rofs capacity each.", mGpuParams.nTimeFrameChunks, mGpuParams.nROFsPerChunk); + + // for (int iChunk{0}; iChunk < mMemChunks.size(); ++iChunk) { + // mMemChunks[iChunk].allocate(GpuTimeFrameChunk::computeRofPerChunk(mGpuParams, mGpuParams.maxGPUMemoryGB), mGpuStreams[iChunk]); + // } + // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + // allocMemAsync(reinterpret_cast(&mROframesClustersDevice[iLayer]), mROframesClusters[iLayer].size() * sizeof(int), nullptr, true); + // allocMemAsync(reinterpret_cast(&(mUsedClustersDevice[iLayer])), sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof, nullptr, true); + // } + // allocMemAsync(reinterpret_cast(&mVerticesDevice), sizeof(Vertex) * mGpuParams.maxVerticesCapacity, nullptr, true); + // allocMemAsync(reinterpret_cast(&mROframesPVDevice), sizeof(int) * (mNrof + 1), nullptr, true); + + // mFirstInit = false; + // } + // if (maxLayers < nLayers) { // Vertexer + // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + // checkGPUError(cudaMemcpy(mROframesClustersDevice[iLayer], mROframesClusters[iLayer].data(), mROframesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); + // } + // } else { // Tracker + // checkGPUError(cudaMemcpy(mVerticesDevice, mPrimaryVertices.data(), sizeof(Vertex) * mPrimaryVertices.size(), cudaMemcpyHostToDevice)); + // checkGPUError(cudaMemcpy(mROframesPVDevice, mROframesPV.data(), sizeof(int) * mROframesPV.size(), cudaMemcpyHostToDevice)); + // if (!iteration) { + // for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + // checkGPUError(cudaMemset(mUsedClustersDevice[iLayer], 0, sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof)); + // } + // } + // } + // checkGPUError(cudaMemcpy(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice)); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 984f39dece4f1..9268802265b08 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -324,11 +324,15 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) template void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) { - // TrackerTraits::findCellsNeighbours(iteration); + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + mTimeFrameGPU->loadCellsDevice(); + mTimeFrameGPU->loadCellsLUT(); + std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getCells()[iLayer + 1].size())}; mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); + if (mTimeFrameGPU->getCells()[iLayer + 1].empty() || mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) { mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); @@ -336,35 +340,59 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) } int layerCellsNum{static_cast(mTimeFrameGPU->getCells()[iLayer].size())}; - std::vector> cellsNeighbours; - cellsNeighbours.reserve(nextLayerCellsNum); - mTimeFrameGPU->loadCellsDevice(); - mTimeFrameGPU->createCellNeighboursDevice(iLayer, cellsNeighbours); - - // // // // [...] - // cellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), - // mTimeFrameGPU->getDeviceArrayNeighboursCellLUT(), - // mTimeFrameGPU->getDeviceNeighbours(iLayer), ); - // // // // Compute Cell Neighbours LUT - // // // checkGPUError(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // // // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // // // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_in - // // // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_out - // // // mTimeFrameGPU->getHostNCells(chunkId)[iLayer + 1], // num_items - // // // mTimeFrameGPU->getStream(chunkId).get())); - - // // cellsNeighboursHandler(mTimeFrameGPU->getDeviceNeighbours(iLayer)); - // // // [...] - - // std::sort(cellsNeighbours.begin(), cellsNeighbours.end(), [](const std::pair& a, const std::pair& b) { - // return a.second < b.second; - // }); - // mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); - // mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighbours.size()); - // for (auto& cellNeighboursIndex : cellsNeighbours) { - // mTimeFrameGPU->getCellsNeighbours()[iLayer].push_back(cellNeighboursIndex.first); - // } - // std::inclusive_scan(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].end(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin()); + mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); + countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. + mTimeFrameGPU->getDeviceArrayCellsLUT(), + mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getNeighboursIndexTablesDevice(iLayer), + mTrkParams[0].MaxChi2ClusterAttachment, + mBz, + iLayer, + layerCellsNum, + nextLayerCellsNum, + 1e2, + conf.nBlocks, + conf.nThreads); + mTimeFrameGPU->downloadNeighboursLUT(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer); + // Get the number of found cells from LUT + cellsNeighboursLayer[iLayer].resize(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].back()); + mTimeFrameGPU->createNeighboursDevice(iLayer, cellsNeighboursLayer[iLayer]); + computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. + mTimeFrameGPU->getDeviceArrayCellsLUT(), + mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getNeighboursIndexTablesDevice(iLayer), + mTrkParams[0].MaxChi2ClusterAttachment, + mBz, + iLayer, + layerCellsNum, + nextLayerCellsNum, + 1e2, + conf.nBlocks, + conf.nThreads); + mTimeFrameGPU->getCellsNeighbours()[iLayer].clear(); + mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size()); + mTimeFrameGPU->downloadCellsNeighbours(cellsNeighboursLayer, iLayer); + std::sort(cellsNeighboursLayer[iLayer].begin(), cellsNeighboursLayer[iLayer].end(), [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + }); + for (auto& cellNeighboursIndex : cellsNeighboursLayer[iLayer]) { + if (cellNeighboursIndex.first != -1) { + mTimeFrameGPU->getCellsNeighbours()[iLayer].push_back(cellNeighboursIndex.first); + } + } + + int c = 0; + std::cout << " ==== Layer " << iLayer << " ====" << std::endl; + for (auto& h : mTimeFrameGPU->getCellsNeighboursLUT()[iLayer]) { + std::cout << h << "\t"; + if (!(++c % 150)) { + std::cout << std::endl; + } + } + std::cout << std::endl; + // std::inclusive_scan(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].end(), mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].begin()); } }; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 9ccdbf510d09f..4993892bc6e68 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -262,20 +262,21 @@ GPUg() void processNeighboursKernel(const CellSeed* currentCellSeed, template // Version for new tracker to supersede the old one GPUg() void computeLayerCellNeighboursKernel( CellSeed** cellSeedArray, - int** neighboursLUT, - const int* cellsNextLayerLUT, + int* neighboursLUT, + int* neighboursIndexTable, + int** cellsLUTs, gpuPair* cellNeighbours, const float maxChi2ClusterAttachment, const float bz, const int layerIndex, - const int* nCells, + const unsigned int nCells, const int maxCellNeighbours = 1e2) { - for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells[layerIndex]; iCurrentCellIndex += blockDim.x * gridDim.x) { + for (int iCurrentCellIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCellIndex < nCells; iCurrentCellIndex += blockDim.x * gridDim.x) { const auto& currentCellSeed{cellSeedArray[layerIndex][iCurrentCellIndex]}; const int nextLayerTrackletIndex{currentCellSeed.getSecondTrackletIndex()}; - const int nextLayerFirstCellIndex{cellsNextLayerLUT[nextLayerTrackletIndex]}; - const int nextLayerLastCellIndex{cellsNextLayerLUT[nextLayerTrackletIndex + 1]}; + const int nextLayerFirstCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex]}; + const int nextLayerLastCellIndex{cellsLUTs[layerIndex][nextLayerTrackletIndex + 1]}; int foundNeighbours{0}; for (int iNextCell{nextLayerFirstCellIndex}; iNextCell < nextLayerLastCellIndex; ++iNextCell) { CellSeed nextCellSeed{cellSeedArray[layerIndex + 1][iNextCell]}; // Copy @@ -292,13 +293,11 @@ GPUg() void computeLayerCellNeighboursKernel( continue; } if constexpr (initRun) { - atomicAdd(neighboursLUT[layerIndex] + iNextCell, 1); + atomicAdd(neighboursLUT + iNextCell, 1); + foundNeighbours++; + neighboursIndexTable[iCurrentCellIndex]++; } else { - if (foundNeighbours >= maxCellNeighbours) { - printf("its-gpu-neighbours-finder: data loss on layer: %d: number of neightbours exceeded the threshold!\n"); - continue; - } - cellNeighbours[neighboursLUT[layerIndex][iNextCell] + foundNeighbours++] = {iCurrentCellIndex, iNextCell}; + cellNeighbours[neighboursIndexTable[iCurrentCellIndex] + foundNeighbours++] = {iCurrentCellIndex, iNextCell}; // FIXME: this is prone to race conditions: check on level is not atomic const int currentCellLevel{currentCellSeed.getLevel()}; @@ -673,7 +672,7 @@ GPUg() void computeLayerCellsKernel( const int* trackletsCurrentLayerLUT, const int nTrackletsCurrent, CellSeed* cells, - int* cellsLUT, + int* cellsLUTs, const StaticTrackingParameters* trkPars) { for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) { @@ -694,17 +693,17 @@ GPUg() void computeLayerCellsKernel( if (deltaTanLambda / trkPars->CellDeltaTanLambdaSigma < trkPars->NSigmaCut) { if constexpr (!initRun) { - new (cells + cellsLUT[iCurrentTrackletIndex] + foundCells) Cell{currentTracklet.firstClusterIndex, nextTracklet.firstClusterIndex, - nextTracklet.secondClusterIndex, - iCurrentTrackletIndex, - iNextTrackletIndex}; + new (cells + cellsLUTs[iCurrentTrackletIndex] + foundCells) Cell{currentTracklet.firstClusterIndex, nextTracklet.firstClusterIndex, + nextTracklet.secondClusterIndex, + iCurrentTrackletIndex, + iNextTrackletIndex}; } ++foundCells; } } if constexpr (initRun) { // Fill cell Lookup table - cellsLUT[iCurrentTrackletIndex] = foundCells; + cellsLUTs[iCurrentTrackletIndex] = foundCells; } } } @@ -759,29 +758,94 @@ GPUg() void computeLayerRoadsKernel( } } // namespace gpu -template -void cellNeighboursHandler(CellSeed** cellsLayersDevice, - int** neighboursLUTs, - const int* cellsNextLayerLUT, - gpuPair* cellNeighbours, - const float maxChi2ClusterAttachment, - const float bz, - const int layerIndex, - const int* nCells, - const int maxCellNeighbours, - const int nBlocks, - const int nThreads) +void countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUT, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads) +{ + gpu::computeLayerCellNeighboursKernel<<>>( + cellsLayersDevice, + neighboursLUT, + neighboursIndexTable, + cellsLUTs, + cellNeighbours, + + maxChi2ClusterAttachment, + bz, + layerIndex, + nCells, + maxCellNeighbours); + void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; + size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; + // gpu::printBufferLayerOnThread<<<1, 1>>>(layerIndex, neighboursLUT, nCellsNext + 1); + gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + neighboursLUT, // d_in + neighboursLUT, // d_out + nCellsNext + 1)); // num_items + + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + neighboursLUT, // d_in + neighboursLUT, // d_out + nCellsNext + 1)); // num_items + + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + temp_storage_bytes_2, // temp_storage_bytes + neighboursIndexTable, // d_in + neighboursIndexTable, // d_out + nCells, // num_items + 0)); + discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + temp_storage_bytes_2, // temp_storage_bytes + neighboursIndexTable, // d_in + neighboursIndexTable, // d_out + nCells, // num_items + 0)); + gpu::printBufferLayerOnThread<<<1, 1>>>(layerIndex, neighboursLUT, nCellsNext + 1); + gpuCheckError(cudaPeekAtLastError()); + gpuCheckError(cudaDeviceSynchronize()); +} + +void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUT, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads) { - gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, - neighboursLUTs, - cellsNextLayerLUT, + neighboursLUT, + neighboursIndexTable, + cellsLUTs, cellNeighbours, maxChi2ClusterAttachment, bz, layerIndex, nCells, maxCellNeighbours); + gpuCheckError(cudaPeekAtLastError()); + gpuCheckError(cudaDeviceSynchronize()); + // Eventually, here will go the sorting. } void trackSeedHandler(CellSeed* trackSeeds, diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 08e50cf9ea824..906eb0fa5c21e 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -288,6 +288,7 @@ class TimeFrame std::vector> mRoads; std::vector> mTracks; std::vector> mCellsNeighbours; + std::vector> mCellsLookupTable; const o2::base::PropagatorImpl* mPropagatorDevice = nullptr; // Needed only for GPU protected: @@ -315,7 +316,6 @@ class TimeFrame std::vector> mPValphaX; /// PV x and alpha for track propagation std::vector> mTrackletLabels; std::vector> mCellLabels; - std::vector> mCellsLookupTable; std::vector> mCellsNeighboursLUT; std::vector> mTracksLabel; std::vector mBogusClusters; /// keep track of clusters with wild coordinates diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 4457d4515e0a6..4fe59bcfc4729 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -450,6 +450,7 @@ void TrackerTraits::findCellsNeighbours(const int iteration) } mTimeFrame->getCellsNeighboursLUT()[iLayer][iNextCell]++; + printf("%d -> %d\n", iCell, iNextCell); cellsNeighbours.push_back(std::make_pair(iCell, iNextCell)); const int currentCellLevel{currentCellSeed.getLevel()}; @@ -467,6 +468,15 @@ void TrackerTraits::findCellsNeighbours(const int iteration) mTimeFrame->getCellsNeighbours()[iLayer].push_back(cellNeighboursIndex.first); } std::inclusive_scan(mTimeFrame->getCellsNeighboursLUT()[iLayer].begin(), mTimeFrame->getCellsNeighboursLUT()[iLayer].end(), mTimeFrame->getCellsNeighboursLUT()[iLayer].begin()); + // int c = 0; + // std::cout << " ==== Layer " << iLayer << " ====" << std::endl; + // for (auto& h : mTimeFrame->getCellsNeighboursLUT()[iLayer]) { + // std::cout << h << "\t"; + // if (!(++c % 150)) { + // std::cout << std::endl; + // } + // } + // std::cout << std::endl; } }