diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 20632a2c50371..57be54acbbbef 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -183,7 +183,8 @@ int filterCellNeighboursHandler(std::vector&, void processNeighboursHandler(const int iteration, const int startLayer, const int startLevel, - CellSeed** currentCellSeeds, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, const unsigned int nCurrentCells, // const int* currentCellIds, // const unsigned int nCurrentCellsIds, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index d46e81421879f..3e89605e592be 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -91,13 +91,12 @@ template void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); mTimeFrameGPU->createTrackletsLUTDevice(iteration); const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); gsl::span diamondSpan(&diamondVert, 1); int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; - int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; + int endROF{o2::gpu::CAMath::Min(mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())}; countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), @@ -284,6 +283,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) startLayer, startLevel, mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceCells()[startLayer], mTimeFrameGPU->getNCells()[startLayer], mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceNeighbours(startLayer - 1), @@ -384,8 +384,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) } mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } + mTimeFrameGPU->loadUsedClustersDevice(); } - mTimeFrameGPU->loadUsedClustersDevice(); if (iteration == mTrkParams.size() - 1) { mTimeFrameGPU->unregisterHostMemory(0); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 500eff04826cb..32a433cc2ffce 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -192,9 +192,6 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, 0.f, 0.f, 0.f, 0.f, sg2q2pt}); } -// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }; -// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }; - struct sort_tracklets { GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); } }; @@ -584,14 +581,13 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, } } -template -GPUg() void processNeighboursKernel(const int iteration, - const int layer, +template +GPUg() void processNeighboursKernel(const int layer, const int level, - CellSeed** currentCellSeeds, - const unsigned int nCurrentCells, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, const int* currentCellIds, - // const unsigned int nCurrentCellsIds, + const unsigned int nCurrentCells, CellSeed* updatedCellSeeds, int* updatedCellsIds, int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration @@ -609,7 +605,12 @@ GPUg() void processNeighboursKernel(const int iteration, constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment. for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) { int foundSeeds{0}; - const auto& currentCell{currentCellSeeds[layer][iCurrentCell]}; + const auto& currentCell{currentCellSeeds[iCurrentCell]}; + if constexpr (debug) { + if (threadIdx.x == 0 && currentCellIds != nullptr) { + currentCellSeeds[iCurrentCell].printCell(); + } + } if (currentCell.getLevel() != level) { continue; } @@ -619,12 +620,17 @@ GPUg() void processNeighboursKernel(const int iteration, continue; } const int cellId = currentCellIds == nullptr ? iCurrentCell : currentCellIds[iCurrentCell]; + // if constexpr (debug) { + // if (threadIdx.x == 0 && currentCellIds != nullptr) { + // printf("-> ic: %d cellId: %d \n", iCurrentCell, cellId); + // } + // } const int startNeighbourId{cellId ? neighboursLUT[cellId - 1] : 0}; const int endNeighbourId{neighboursLUT[cellId]}; for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) { const int neighbourCellId = neighbours[iNeighbourCell]; - const CellSeed& neighbourCell = currentCellSeeds[layer - 1][neighbourCellId]; + const CellSeed& neighbourCell = allCellSeeds[layer - 1][neighbourCellId]; if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) { continue; } @@ -684,6 +690,7 @@ GPUd() void pPointer(T* ptr) { printf("[%p]\t", ptr); } + template GPUg() void printPointersKernel(std::tuple args) { @@ -791,6 +798,15 @@ GPUg() void printTrackletsLUTPerROF(const int layerId, } } } + +GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = 0) +{ + for (unsigned int iCell{0}; iCell < nCells; ++iCell) { + if (threadIdx.x == tId) { + seed[iCell].printCell(); + } + } +} } // namespace gpu template @@ -1163,7 +1179,8 @@ int filterCellNeighboursHandler(std::vector& neighHost, // TODO: eventually void processNeighboursHandler(const int iteration, const int startLayer, const int startLevel, - CellSeed** currentCellSeeds, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, const unsigned int nCurrentCells, // const int* currentCellIds, // const unsigned int nCurrentCellsIds, @@ -1181,12 +1198,12 @@ void processNeighboursHandler(const int iteration, const int nThreads) { thrust::device_vector foundSeedsTable(nCurrentCells + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this. - gpu::processNeighboursKernel<<>>(iteration, - startLayer, + gpu::processNeighboursKernel<<>>(startLayer, startLevel, + allCellSeeds, currentCellSeeds, - nCurrentCells, nullptr, // currentCellIds, + nCurrentCells, nullptr, // updatedCellSeeds, nullptr, // updatedCellsIds, thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration @@ -1217,30 +1234,88 @@ void processNeighboursHandler(const int iteration, thrust::device_vector updatedCellIds(foundSeedsTable.back()), lastCellIds(foundSeedsTable.back()); thrust::device_vector updatedCellSeeds(foundSeedsTable.back()), lastCellSeeds(foundSeedsTable.back()); - gpu::processNeighboursKernel<<<1, 1>>>(iteration, - startLayer, - startLevel, - currentCellSeeds, - nCurrentCells, - nullptr, // currentCellIds, - thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds - thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds - thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration - usedClusters, // Used clusters - neighbours, - neighboursLUT, - foundTrackingFrameInfo, - bz, - MaxChi2ClusterAttachment, - propagator, - matCorrType); + gpu::processNeighboursKernel<<>>(startLayer, + startLevel, + allCellSeeds, + currentCellSeeds, + nullptr, // currentCellIds, + nCurrentCells, + thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds + thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds + thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + usedClusters, // Used clusters + neighbours, + neighboursLUT, + foundTrackingFrameInfo, + bz, + MaxChi2ClusterAttachment, + propagator, + matCorrType); + LOGP(info, "1: updatedCellIds {} - updatedCellSeeds {}", updatedCellIds.size(), updatedCellSeeds.size()); + // gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&updatedCellSeeds[0]), foundSeedsTable.back()); + // gpu::printBufferLayerOnThread<<<1, 1>>>(0, thrust::raw_pointer_cast(&updatedCellIds[0]), updatedCellIds.size()); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { - lastCellSeeds.swap(updatedCellSeeds); - lastCellIds.swap(updatedCellIds); - // std::vector().swap(updatedCellSeeds); /// tame the memory peaks - // updatedCellId.clear(); - // processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId); + LOGP(info, "iLayer: {} level {}", iLayer, level); + --level; + // lastCellSeeds.swap(updatedCellSeeds); + // lastCellIds.swap(updatedCellIds); + // // gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&lastCellSeeds[0]), lastCellSeeds.size()); + // foundSeedsTable.resize(lastCellSeeds.size() + 1); + // thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); + // gpu::processNeighboursKernel<<<1, 1>>>(iLayer, + // level, + // allCellSeeds, + // thrust::raw_pointer_cast(&lastCellSeeds[0]), + // thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds, + // lastCellSeeds.size(), + // nullptr, // updatedCellSeeds, + // nullptr, // updatedCellsIds, + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + // usedClusters, // Used clusters + // neighbours, + // neighboursLUT, + // foundTrackingFrameInfo, + // bz, + // MaxChi2ClusterAttachment, + // propagator, + // matCorrType); + // gpu::printBufferLayerOnThread<<<1, 1>>>(0, thrust::raw_pointer_cast(&foundSeedsTable[0]), foundSeedsTable.size()); + // gpuCheckError(cudaPeekAtLastError()); + // gpuCheckError(cudaDeviceSynchronize()); + // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + // temp_storage_bytes, // temp_storage_bytes + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + // lastCellSeeds.size() + 1, // num_items + // 0)); + // discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + // gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage + // temp_storage_bytes_2, // temp_storage_bytes + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out + // lastCellSeeds.size() + 1, // num_items + // 0)); + // updatedCellIds.resize(foundSeedsTable.back(), 0); + // updatedCellSeeds.resize(foundSeedsTable.back(), CellSeed()); + // gpu::processNeighboursKernel<<>>(iLayer, + // level, + // allCellSeeds, + // thrust::raw_pointer_cast(&lastCellSeeds[0]), + // thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds, + // lastCellSeeds.size(), + // thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds + // thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds + // thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration + // usedClusters, // Used clusters + // neighbours, + // neighboursLUT, + // foundTrackingFrameInfo, + // bz, + // MaxChi2ClusterAttachment, + // propagator, + // matCorrType); + // LOGP(info, " -> 2: updatedCellIds {} - updatedCellSeeds {}", updatedCellIds.size(), updatedCellSeeds.size()); } gpuCheckError(cudaFree(d_temp_storage)); diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h index cb9f28665cf07..257e57dbd3fda 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h @@ -15,7 +15,7 @@ #ifndef TRACKINGITSU_INCLUDE_CACELL_H_ #define TRACKINGITSU_INCLUDE_CACELL_H_ - +#include #ifndef GPUCA_GPUCODE_DEVICE #include #include @@ -104,6 +104,7 @@ class CellSeed final : public o2::track::TrackParCovF GPUhd() int* getLevelPtr() { return &mLevel; } GPUhd() int* getClusters() { return mClusters; } GPUhd() int getCluster(int i) const { return mClusters[i]; } + GPUhdi() void printCell(); private: int mClusters[7] = {-1, -1, -1, -1, -1, -1, -1}; @@ -112,6 +113,11 @@ class CellSeed final : public o2::track::TrackParCovF float mChi2 = 0.f; }; +GPUhdi() void CellSeed::printCell() +{ + printf("trkl: %d, %d\t lvl: %d\t chi2: %f\n", mTracklets[0], mTracklets[1], mLevel, mChi2); +} + } // namespace its } // namespace o2 #endif /* TRACKINGITSU_INCLUDE_CACELL_H_ */ diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 35498711a6677..0461f9e3c43e9 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -486,7 +486,7 @@ void TrackerTraits::processNeighbours(int iLayer, int failed[5]{0, 0, 0, 0, 0}, attempts{0}, failedByMismatch{0}; #endif -#pragma omp parallel for num_threads(mNThreads) + // #pragma omp parallel for num_threads(mNThreads) for (unsigned int iCell = 0; iCell < currentCellSeed.size(); ++iCell) { const CellSeed& currentCell{currentCellSeed[iCell]}; if (currentCell.getLevel() != iLevel) { @@ -498,6 +498,9 @@ void TrackerTraits::processNeighbours(int iLayer, continue; /// this we do only on the first iteration, hence the check on currentCellId } const int cellId = currentCellId.empty() ? iCell : currentCellId[iCell]; + // if (!currentCellId.empty()) { + // printf("-> ic: %d cellId: %d \n", iCell, cellId); + // } const int startNeighbourId{cellId ? mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId - 1] : 0}; const int endNeighbourId{mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId]}; @@ -583,14 +586,26 @@ void TrackerTraits::findRoads(const int iteration) std::vector lastCellSeed, updatedCellSeed; processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); - + LOGP(info, "1: updatedCellIds {} - updatedCellSeeds {}", updatedCellId.size(), updatedCellSeed.size()); + // for (auto& c : updatedCellSeed) { + // c.printCell(); + // } + // for (int i{0}; i < updatedCellId.size(); ++i) { + // if (!(i % 150)) { + // printf("\n layer %d: ===> %d/%d\t", 0, i, (int)updatedCellId.size()); + // } + // printf("%d\t", updatedCellId[i]); + // } + // printf("\n"); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { + LOGP(info, "iLayer: {} level {}", iLayer, level); lastCellSeed.swap(updatedCellSeed); lastCellId.swap(updatedCellId); std::vector().swap(updatedCellSeed); /// tame the memory peaks updatedCellId.clear(); processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId); + // LOGP(info, " -> 2: updatedCellIds {} - updatedCellSeeds {}", updatedCellId.size(), updatedCellSeed.size()); } for (auto& seed : updatedCellSeed) { if (seed.getQ2Pt() > 1.e3 || seed.getChi2() > mTrkParams[0].MaxChi2NDF * ((startLevel + 2) * 2 - 5)) {