diff --git a/CMakeLists.txt b/CMakeLists.txt index d28f191021fdf..d50f2e2e1bc55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,9 @@ cmake_minimum_required(VERSION 3.27.1 FATAL_ERROR) # it's important to specify accurately the list of languages. for instance C and # C++ as we _do_ have some C files to compile explicitely as C (e.g. gl3w.c) project(O2 LANGUAGES C CXX VERSION 1.2.0) - +add_definitions(-DGPUCA_NO_FAST_MATH=1) +set(GPUCA_NO_FAST_MATH 1) +set(GPUCA_NO_FAST_MATH_WHOLEO2 1) include(CTest) # Project wide setup diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 12a5f877c0135..73955be325ff7 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -184,7 +184,6 @@ class TimeFrameGPU : public TimeFrame void registerHostMemory(const int); void unregisterHostMemory(const int); void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr); - void initialiseHybrid(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr); void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); void loadTrackingFrameInfoDevice(const int); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index e96125bdd3bc7..cc74456bbb1aa 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -59,6 +59,8 @@ void trackSeedHandler(CellSeed* trackSeeds, float maxChi2ClusterAttachment, float maxChi2NDF, const o2::base::Propagator* propagator, - const o2::base::PropagatorF::MatCorrType matCorrType); + const o2::base::PropagatorF::MatCorrType matCorrType, + const int nBlocks, + const int nThreads); } // namespace o2::its #endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_ diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index ff9ef6c80b9a3..05edc847f1e05 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -362,27 +362,6 @@ void TimeFrameGPU::initialise(const int iteration, const int maxLayers, IndexTableUtils* utils, const TimeFrameGPUParameters* gpuParam) -{ - mGpuStreams.resize(mGpuParams.nTimeFrameChunks); - mHostNTracklets.resize((nLayers - 1) * mGpuParams.nTimeFrameChunks, 0); - mHostNCells.resize((nLayers - 2) * mGpuParams.nTimeFrameChunks, 0); - - auto init = [&]() -> void { - this->initDevice(utils, trkParam, *gpuParam, maxLayers, iteration); - }; - std::thread t1{init}; - RANGE("tf_cpu_initialisation", 1); - o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); - // registerHostMemory(maxLayers); - t1.join(); -} - -template -void TimeFrameGPU::initialiseHybrid(const int iteration, - const TrackingParameters& trkParam, - const int maxLayers, - IndexTableUtils* utils, - const TimeFrameGPUParameters* gpuParam) { mGpuStreams.resize(mGpuParams.nTimeFrameChunks); o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 7f1d6812bc6cd..ac8b3f87b874c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -20,6 +20,7 @@ #include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" +#include "ITStracking/TrackingConfigParam.h" namespace o2::its { @@ -28,7 +29,7 @@ constexpr int UnusedIndex{-1}; template void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) { - mTimeFrameGPU->initialiseHybrid(iteration, mTrkParams[iteration], nLayers); + mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); } @@ -397,7 +398,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) } mTimeFrameGPU->createTrackITSExtDevice(trackSeeds); mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds); - + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); trackSeedHandler( mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds, mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo, @@ -408,7 +409,9 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment, mTrkParams[0].MaxChi2NDF, // float maxChi2NDF, mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator - mCorrType); // o2::base::PropagatorImpl::MatCorrType + mCorrType, // o2::base::PropagatorImpl::MatCorrType + conf.nBlocks, + conf.nThreads); mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 2374e7b8d04a2..60683e5fea30b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -717,9 +717,11 @@ void trackSeedHandler(CellSeed* trackSeeds, float maxChi2ClusterAttachment, float maxChi2NDF, const o2::base::Propagator* propagator, - const o2::base::PropagatorF::MatCorrType matCorrType) + const o2::base::PropagatorF::MatCorrType matCorrType, + const int nBlocks, + const int nThreads) { - gpu::fitTrackSeedsKernel<<<20, 256>>>( + gpu::fitTrackSeedsKernel<<>>( trackSeeds, // CellSeed* trackSeeds, foundTrackingFrameInfo, // TrackingFrameInfo** foundTrackingFrameInfo, tracks, // o2::its::TrackITSExt* tracks, @@ -734,4 +736,4 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); } -} // namespace o2::its +} // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx index 0e03dd0f25ce4..a26d52b2961c3 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx @@ -37,6 +37,7 @@ void VertexerTraitsGPU::initialise(const TrackingParameters& trackingParams, con { mTimeFrameGPU->initialise(0, trackingParams, 3, &mIndexTableUtils, &mTfGPUParams); } + void VertexerTraitsGPU::updateVertexingParameters(const std::vector& vrtPar, const TimeFrameGPUParameters& tfPar) { mVrtParams = vrtPar; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu index 9e99687c3be6a..2ba4471ef61e5 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu @@ -39,8 +39,23 @@ void trackletFinderHandler(const Cluster* clustersNextLayer, // 0 2 const unsigned int startRofId, const unsigned int rofSize, const float phiCut, - const size_t maxTrackletsPerCluster) + const unsigned int maxTrackletsPerCluster, + const int nBlocks, + const int nThreads) { + gpu::trackleterKernelMultipleRof<<>>( + clustersNextLayer, // const Cluster* clustersNextLayer, // 0 2 + clustersCurrentLayer, // const Cluster* clustersCurrentLayer, // 1 1 + sizeNextLClusters, // const int* sizeNextLClusters, + sizeCurrentLClusters, // const int* sizeCurrentLClusters, + nextIndexTables, // const int* nextIndexTables, + Tracklets, // Tracklet* Tracklets, + foundTracklets, // int* foundTracklets, + utils, // const IndexTableUtils* utils, + startRofId, // const unsigned int startRofId, + rofSize, // const unsigned int rofSize, + phiCut, // const float phiCut, + maxTrackletsPerCluster); // const unsigned int maxTrackletsPerCluster = 1e2 } /* GPUd() float smallestAngleDifference(float a, float b) @@ -96,7 +111,7 @@ GPUd() void printOnBlock(const unsigned int bId, const char* str, Args... args) } } -GPUg() void printBufferOnThread(const int* v, size_t size, const int len = 150, const unsigned int tId = 0) +GPUg() void printBufferOnThread(const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { for (int i{0}; i < size; ++i) { @@ -109,7 +124,7 @@ GPUg() void printBufferOnThread(const int* v, size_t size, const int len = 150, } } -GPUg() void printBufferOnThreadF(const float* v, size_t size, const unsigned int tId = 0) +GPUg() void printBufferOnThreadF(const float* v, unsigned int size, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { printf("vector :"); @@ -127,7 +142,7 @@ GPUg() void resetTrackletsKernel(Tracklet* tracklets, const int nTracklets) } } -GPUg() void dumpFoundTrackletsKernel(const Tracklet* tracklets, const int* nTracklet, const size_t nClustersMiddleLayer, const int maxTrackletsPerCluster) +GPUg() void dumpFoundTrackletsKernel(const Tracklet* tracklets, const int* nTracklet, const unsigned int nClustersMiddleLayer, const int maxTrackletsPerCluster) { for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nClustersMiddleLayer; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) { const int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; @@ -160,7 +175,7 @@ GPUg() void trackleterKernelSingleRof( int* foundTracklets, const IndexTableUtils* utils, const short rofId, - const size_t maxTrackletsPerCluster = 1e2) + const unsigned int maxTrackletsPerCluster = 1e2) { const int phiBins{utils->getNphiBins()}; const int zBins{utils->getNzBins()}; @@ -168,7 +183,7 @@ GPUg() void trackleterKernelSingleRof( for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < sizeCurrentLClusters; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) { if (iCurrentLayerClusterIndex < sizeCurrentLClusters) { unsigned int storedTracklets{0}; - const size_t stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; + const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; const Cluster& currentCluster = clustersCurrentLayer[iCurrentLayerClusterIndex]; const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)}; if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) { @@ -218,7 +233,7 @@ GPUg() void trackleterKernelMultipleRof( const short startRofId, const short rofSize, const float phiCut, - const size_t maxTrackletsPerCluster = 1e2) + const unsigned int maxTrackletsPerCluster = 1e2) { const int phiBins{utils->getNphiBins()}; const int zBins{utils->getNzBins()}; @@ -235,7 +250,7 @@ GPUg() void trackleterKernelMultipleRof( // single rof loop on layer1 clusters for (int iCurrentLayerClusterIndex = threadIdx.x; iCurrentLayerClusterIndex < nClustersCurrentLayerRof; iCurrentLayerClusterIndex += blockDim.x) { unsigned int storedTracklets{0}; - const size_t stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; + const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; const Cluster& currentCluster = clustersCurrentLayerRof[iCurrentLayerClusterIndex]; const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)}; if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) { @@ -276,7 +291,7 @@ template GPUg() void trackletSelectionKernelSingleRof( const Cluster* clusters0, const Cluster* clusters1, - const size_t nClustersMiddleLayer, + const unsigned int nClustersMiddleLayer, Tracklet* tracklets01, Tracklet* tracklets12, const int* nFoundTracklet01, @@ -436,7 +451,7 @@ GPUg() void computeCentroidsKernel( Line* lines, int* nFoundLines, int* nExclusiveFoundLines, - const size_t nClustersMiddleLayer, + const unsigned int nClustersMiddleLayer, float* centroids, const float lowHistX, const float highHistX, @@ -446,7 +461,7 @@ GPUg() void computeCentroidsKernel( { const int nLines = nExclusiveFoundLines[nClustersMiddleLayer - 1] + nFoundLines[nClustersMiddleLayer - 1]; const int maxIterations{nLines * (nLines - 1) / 2}; - for (size_t currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < maxIterations; currentThreadIndex += blockDim.x * gridDim.x) { + for (unsigned int currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < maxIterations; currentThreadIndex += blockDim.x * gridDim.x) { int iFirstLine = currentThreadIndex / nLines; int iSecondLine = currentThreadIndex % nLines; // All unique pairs @@ -496,7 +511,7 @@ GPUg() void computeZCentroidsKernel( const int binOpeningX, const int binOpeningY) { - for (size_t currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < nLines; currentThreadIndex += blockDim.x * gridDim.x) { + for (unsigned int currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < nLines; currentThreadIndex += blockDim.x * gridDim.x) { if (tmpVtX[0].value || tmpVtX[1].value) { float tmpX{lowHistX + tmpVtX[0].key * binSizeHistX + binSizeHistX / 2}; int sumWX{tmpVtX[0].value}; @@ -543,7 +558,7 @@ GPUg() void computeVertexKernel( const int minContributors, const int binOpeningZ) { - for (size_t currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < binOpeningZ; currentThreadIndex += blockDim.x * gridDim.x) { + for (unsigned int currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < binOpeningZ; currentThreadIndex += blockDim.x * gridDim.x) { if (currentThreadIndex == 0) { if (tmpVertexBins[2].value > 1 && (tmpVertexBins[0].value || tmpVertexBins[1].value)) { float z{lowHistZ + tmpVertexBins[2].key * binSizeHistZ + binSizeHistZ / 2}; diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h index ee0ee633a3721..fe5e52bd6277a 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h @@ -55,7 +55,6 @@ struct VertexerParamConfig : public o2::conf::ConfigurableParamHelper { - // Use TGeo for mat. budget bool useMatCorrTGeo = false; bool useFastMaterial = false; @@ -89,24 +88,13 @@ struct TrackerParamConfig : public o2::conf::ConfigurableParamHelper { +struct ITSGpuTrackingParamConfig : public o2::conf::ConfigurableParamHelper { // GPU-specific parameters - size_t tmpCUBBufferSize = 1e5; // In average in pp events there are required 4096 bytes - size_t maxTrackletsPerCluster = 1e2; - size_t clustersPerLayerCapacity = 2.5e5; - size_t clustersPerROfCapacity = 1.5e3; - // size_t trackletsCapacity = maxTrackletsPerCluster * clustersPerLayerCapacity; - size_t validatedTrackletsCapacity = 1e5; - size_t cellsLUTsize = validatedTrackletsCapacity; - size_t maxNeighboursSize = 1e4; - size_t neighboursLUTsize = maxNeighboursSize; - size_t maxRoadPerRofSize = 5e2; // pp! - size_t maxLinesCapacity = 1e2; - size_t maxVerticesCapacity = 5e4; - size_t nTimeFramePartitions = 3; - int maxGPUMemoryGB = -1; + unsigned int tmpCUBBufferSize = 1e5; // In average in pp events there are required 4096 bytes + int nBlocks = 20; + int nThreads = 256; - O2ParamDef(GpuRecoParamConfig, "ITSGpuRecoParam"); + O2ParamDef(ITSGpuTrackingParamConfig, "ITSGpuTrackingParam"); }; } // namespace its diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingConfigParam.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingConfigParam.cxx index 4e67f919ac005..33edd140dd234 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackingConfigParam.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingConfigParam.cxx @@ -18,10 +18,10 @@ namespace its { static auto& sVertexerParamITS = o2::its::VertexerParamConfig::Instance(); static auto& sCATrackerParamITS = o2::its::TrackerParamConfig::Instance(); -static auto& sGpuRecoParamITS = o2::its::GpuRecoParamConfig::Instance(); +static auto& sGpuRecoParamITS = o2::its::ITSGpuTrackingParamConfig::Instance(); O2ParamImpl(o2::its::VertexerParamConfig); O2ParamImpl(o2::its::TrackerParamConfig); -O2ParamImpl(o2::its::GpuRecoParamConfig); +O2ParamImpl(o2::its::ITSGpuTrackingParamConfig); } // namespace its } // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingLinkDef.h b/Detectors/ITSMFT/ITS/tracking/src/TrackingLinkDef.h index f388943726b82..b06a4fd7d7d62 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackingLinkDef.h +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingLinkDef.h @@ -30,7 +30,7 @@ #pragma link C++ class o2::its::TrackerParamConfig + ; #pragma link C++ class o2::conf::ConfigurableParamHelper < o2::its::TrackerParamConfig> + ; -#pragma link C++ class o2::its::GpuRecoParamConfig + ; -#pragma link C++ class o2::conf::ConfigurableParamHelper < o2::its::GpuRecoParamConfig> + ; +#pragma link C++ class o2::its::ITSGpuTrackingParamConfig + ; +#pragma link C++ class o2::conf::ConfigurableParamHelper < o2::its::ITSGpuTrackingParamConfig> + ; #endif diff --git a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx index 69d1ac9e7547c..ccbaf8e26f86e 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx @@ -104,7 +104,8 @@ void Vertexer::getGlobalConfiguration() { auto& vc = o2::its::VertexerParamConfig::Instance(); vc.printKeyValues(true, true); - auto& grc = o2::its::GpuRecoParamConfig::Instance(); + auto& grc = o2::its::ITSGpuTrackingParamConfig::Instance(); + // This is odd: we override only the parameters for the first iteration. // Variations for the next iterations are set in the trackingInterfrace.