diff --git a/RecoTracker/LSTCore/interface/HitsSoA.h b/RecoTracker/LSTCore/interface/HitsSoA.h index b1f5de9eff46e..d26fe61963a0d 100644 --- a/RecoTracker/LSTCore/interface/HitsSoA.h +++ b/RecoTracker/LSTCore/interface/HitsSoA.h @@ -27,8 +27,8 @@ namespace lst { SOA_COLUMN(ArrayIx2, hitRanges), SOA_COLUMN(int, hitRangesLower), SOA_COLUMN(int, hitRangesUpper), - SOA_COLUMN(int8_t, hitRangesnLower), - SOA_COLUMN(int8_t, hitRangesnUpper)) + SOA_COLUMN(int16_t, hitRangesnLower), + SOA_COLUMN(int16_t, hitRangesnUpper)) using HitsSoA = HitsSoALayout<>; using HitsRangesSoA = HitsRangesSoALayout<>; diff --git a/RecoTracker/LSTCore/interface/alpaka/Common.h b/RecoTracker/LSTCore/interface/alpaka/Common.h index 16e53e7bfe099..337ee2ab814de 100644 --- a/RecoTracker/LSTCore/interface/alpaka/Common.h +++ b/RecoTracker/LSTCore/interface/alpaka/Common.h @@ -10,33 +10,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { using namespace ::lst; - Vec3D constexpr elementsPerThread(Vec3D::all(static_cast(1))); - - ALPAKA_FN_HOST ALPAKA_FN_INLINE void lstWarning(std::string warning) { - edm::LogWarning("LST") << warning; - return; - } - - // Adjust grid and block sizes based on backend configuration - template > - ALPAKA_FN_HOST ALPAKA_FN_INLINE WorkDiv createWorkDiv(const Vec& blocksPerGrid, - const Vec& threadsPerBlock, - const Vec& elementsPerThreadArg) { - Vec adjustedBlocks = blocksPerGrid; - Vec adjustedThreads = threadsPerBlock; - - // special overrides for CPU/host cases - if constexpr (std::is_same_v) { - adjustedBlocks = Vec::all(static_cast(1)); - - if constexpr (alpaka::accMatchesTags) { - // Serial execution, set threads to 1 as well - adjustedThreads = Vec::all(static_cast(1)); // probably redundant - } - } - - return WorkDiv(adjustedBlocks, adjustedThreads, elementsPerThreadArg); - } + ALPAKA_FN_HOST ALPAKA_FN_INLINE void lstWarning(std::string warning) { edm::LogWarning("LST") << warning; } // The constants below are usually used in functions like alpaka::math::min(), // 
expecting a reference (T const&) in the arguments. Hence, diff --git a/RecoTracker/LSTCore/src/alpaka/Hit.h b/RecoTracker/LSTCore/src/alpaka/Hit.h index e5d3eb4226abc..981711daa47f6 100644 --- a/RecoTracker/LSTCore/src/alpaka/Hit.h +++ b/RecoTracker/LSTCore/src/alpaka/Hit.h @@ -1,6 +1,8 @@ #ifndef RecoTracker_LSTCore_src_alpaka_Hit_h #define RecoTracker_LSTCore_src_alpaka_Hit_h +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + #include "RecoTracker/LSTCore/interface/alpaka/Common.h" #include "RecoTracker/LSTCore/interface/ModulesSoA.h" #include "RecoTracker/LSTCore/interface/alpaka/HitsDeviceCollection.h" @@ -57,15 +59,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct ModuleRangesKernel { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, HitsRanges hitsRanges, int nLowerModules) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (int lowerIndex = globalThreadIdx[2]; lowerIndex < nLowerModules; lowerIndex += gridThreadExtent[2]) { + for (int lowerIndex : cms::alpakatools::uniform_elements(acc, nLowerModules)) { uint16_t upperIndex = modules.partnerModuleIndices()[lowerIndex]; if (hitsRanges.hitRanges()[lowerIndex][0] != -1 && hitsRanges.hitRanges()[upperIndex][0] != -1) { hitsRanges.hitRangesLower()[lowerIndex] = hitsRanges.hitRanges()[lowerIndex][0]; @@ -80,8 +78,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct HitLoopKernel { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint16_t Endcap, // Integer corresponding to endcap in module subdets uint16_t TwoS, // Integer corresponding to TwoS in moduleType unsigned int nModules, // Number of modules @@ -94,9 +91,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { { auto geoMapDetId = endcapGeometry.geoMapDetId(); // DetId's from endcap map auto geoMapPhi = 
endcapGeometry.geoMapPhi(); // Phi values from endcap map - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (unsigned int ihit = globalThreadIdx[2]; ihit < nHits; ihit += gridThreadExtent[2]) { + for (unsigned int ihit : cms::alpakatools::uniform_elements(acc, nHits)) { float ihit_x = hits.xs()[ihit]; float ihit_y = hits.ys()[ihit]; float ihit_z = hits.zs()[ihit]; diff --git a/RecoTracker/LSTCore/src/alpaka/Kernels.h b/RecoTracker/LSTCore/src/alpaka/Kernels.h index 13d0d2b0e1202..4c738259dfd87 100644 --- a/RecoTracker/LSTCore/src/alpaka/Kernels.h +++ b/RecoTracker/LSTCore/src/alpaka/Kernels.h @@ -1,6 +1,8 @@ #ifndef RecoTracker_LSTCore_src_alpaka_Kernels_h #define RecoTracker_LSTCore_src_alpaka_Kernels_h +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + #include "RecoTracker/LSTCore/interface/alpaka/Common.h" #include "RecoTracker/LSTCore/interface/ModulesSoA.h" #include "RecoTracker/LSTCore/interface/ObjectRangesSoA.h" @@ -139,26 +141,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct RemoveDupQuintupletsAfterBuild { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, Quintuplets quintuplets, QuintupletsOccupancyConst quintupletsOccupancy, ObjectRangesConst ranges) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (unsigned int lowmod = globalThreadIdx[0]; lowmod < modules.nLowerModules(); lowmod += gridThreadExtent[0]) { + for (unsigned int lowmod : cms::alpakatools::uniform_elements_z(acc, modules.nLowerModules())) { unsigned int nQuintuplets_lowmod = quintupletsOccupancy.nQuintuplets()[lowmod]; int quintupletModuleIndices_lowmod = ranges.quintupletModuleIndices()[lowmod]; - for (unsigned int ix1 = globalThreadIdx[1]; ix1 < nQuintuplets_lowmod; ix1 += gridThreadExtent[1]) { + for (unsigned int ix1 : 
cms::alpakatools::uniform_elements_y(acc, nQuintuplets_lowmod)) { unsigned int ix = quintupletModuleIndices_lowmod + ix1; float eta1 = __H2F(quintuplets.eta()[ix]); float phi1 = __H2F(quintuplets.phi()[ix]); float score_rphisum1 = __H2F(quintuplets.score_rphisum()[ix]); - for (unsigned int jx1 = globalThreadIdx[2] + ix1 + 1; jx1 < nQuintuplets_lowmod; jx1 += gridThreadExtent[2]) { + for (unsigned int jx1 : cms::alpakatools::uniform_elements_x(acc, ix1 + 1, nQuintuplets_lowmod)) { unsigned int jx = quintupletModuleIndices_lowmod + jx1; float eta2 = __H2F(quintuplets.eta()[jx]); @@ -189,16 +187,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct RemoveDupQuintupletsBeforeTC { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc2D const& acc, Quintuplets quintuplets, QuintupletsOccupancyConst quintupletsOccupancy, ObjectRangesConst ranges) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (unsigned int lowmodIdx1 = globalThreadIdx[1]; lowmodIdx1 < ranges.nEligibleT5Modules(); - lowmodIdx1 += gridThreadExtent[1]) { + for (unsigned int lowmodIdx1 : cms::alpakatools::uniform_elements_y(acc, ranges.nEligibleT5Modules())) { uint16_t lowmod1 = ranges.indicesOfEligibleT5Modules()[lowmodIdx1]; unsigned int nQuintuplets_lowmod1 = quintupletsOccupancy.nQuintuplets()[lowmod1]; if (nQuintuplets_lowmod1 == 0) @@ -206,8 +199,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int quintupletModuleIndices_lowmod1 = ranges.quintupletModuleIndices()[lowmod1]; - for (unsigned int lowmodIdx2 = globalThreadIdx[2] + lowmodIdx1; lowmodIdx2 < ranges.nEligibleT5Modules(); - lowmodIdx2 += gridThreadExtent[2]) { + for (unsigned int lowmodIdx2 : + cms::alpakatools::uniform_elements_x(acc, lowmodIdx1, ranges.nEligibleT5Modules())) { uint16_t lowmod2 = ranges.indicesOfEligibleT5Modules()[lowmodIdx2]; unsigned int nQuintuplets_lowmod2 = 
quintupletsOccupancy.nQuintuplets()[lowmod2]; if (nQuintuplets_lowmod2 == 0) @@ -272,13 +265,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct RemoveDupPixelTripletsFromMap { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, PixelTriplets pixelTriplets) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (unsigned int ix = globalThreadIdx[1]; ix < pixelTriplets.nPixelTriplets(); ix += gridThreadExtent[1]) { - for (unsigned int jx = globalThreadIdx[2]; jx < pixelTriplets.nPixelTriplets(); jx += gridThreadExtent[2]) { + ALPAKA_FN_ACC void operator()(Acc2D const& acc, PixelTriplets pixelTriplets) const { + for (unsigned int ix : cms::alpakatools::uniform_elements_y(acc, pixelTriplets.nPixelTriplets())) { + for (unsigned int jx : cms::alpakatools::uniform_elements_x(acc, pixelTriplets.nPixelTriplets())) { if (ix == jx) continue; @@ -306,15 +295,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct RemoveDupPixelQuintupletsFromMap { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, PixelQuintuplets pixelQuintuplets) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - + ALPAKA_FN_ACC void operator()(Acc2D const& acc, PixelQuintuplets pixelQuintuplets) const { unsigned int nPixelQuintuplets = pixelQuintuplets.nPixelQuintuplets(); - for (unsigned int ix = globalThreadIdx[1]; ix < nPixelQuintuplets; ix += gridThreadExtent[1]) { + for (unsigned int ix : cms::alpakatools::uniform_elements_y(acc, nPixelQuintuplets)) { float score1 = __H2F(pixelQuintuplets.score()[ix]); - for (unsigned int jx = globalThreadIdx[2]; jx < nPixelQuintuplets; jx += gridThreadExtent[2]) { + for (unsigned int jx : cms::alpakatools::uniform_elements_x(acc, nPixelQuintuplets)) { if (ix == jx) continue; @@ -333,22 +318,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct CheckHitspLS { - template - ALPAKA_FN_ACC void 
operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc2D const& acc, ModulesConst modules, SegmentsOccupancyConst segmentsOccupancy, SegmentsPixel segmentsPixel, bool secondpass) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - int pixelModuleIndex = modules.nLowerModules(); unsigned int nPixelSegments = segmentsOccupancy.nSegments()[pixelModuleIndex]; if (nPixelSegments > n_max_pixel_segments_per_module) nPixelSegments = n_max_pixel_segments_per_module; - for (unsigned int ix = globalThreadIdx[1]; ix < nPixelSegments; ix += gridThreadExtent[1]) { + for (unsigned int ix : cms::alpakatools::uniform_elements_y(acc, nPixelSegments)) { if (secondpass && (!segmentsPixel.isQuad()[ix] || (segmentsPixel.isDup()[ix] & 1))) continue; @@ -360,7 +341,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { float eta_pix1 = segmentsPixel.eta()[ix]; float phi_pix1 = segmentsPixel.phi()[ix]; - for (unsigned int jx = ix + 1 + globalThreadIdx[2]; jx < nPixelSegments; jx += gridThreadExtent[2]) { + for (unsigned int jx : cms::alpakatools::uniform_elements_x(acc, ix + 1, nPixelSegments)) { float eta_pix2 = segmentsPixel.eta()[jx]; float phi_pix2 = segmentsPixel.phi()[jx]; diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc index 8de38b530f93e..d5b209ea2382d 100644 --- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc +++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc @@ -1,7 +1,10 @@ #include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "LSTEvent.h" +#include "Hit.h" +#include "Kernels.h" #include "MiniDoublet.h" #include "PixelQuintuplet.h" #include "PixelTriplet.h" @@ -123,11 +126,9 @@ void LSTEvent::addHitToEvent(std::vector const& x, alpaka::memcpy(queue_, idxs_d, idxInNtuple, (Idx)nHits); alpaka::wait(queue_); // FIXME: remove synch after inputs refactored to be 
in pinned memory - Vec3D const threadsPerBlock1{1, 1, 256}; - Vec3D const blocksPerGrid1{1, 1, max_blocks}; - WorkDiv3D const hit_loop_workdiv = createWorkDiv(blocksPerGrid1, threadsPerBlock1, elementsPerThread); + auto const hit_loop_workdiv = cms::alpakatools::make_workdiv(max_blocks, 256); - alpaka::exec(queue_, + alpaka::exec(queue_, hit_loop_workdiv, HitLoopKernel{}, Endcap, @@ -140,11 +141,9 @@ void LSTEvent::addHitToEvent(std::vector const& x, hitsDC_->view(), nHits); - Vec3D const threadsPerBlock2{1, 1, 256}; - Vec3D const blocksPerGrid2{1, 1, max_blocks}; - WorkDiv3D const module_ranges_workdiv = createWorkDiv(blocksPerGrid2, threadsPerBlock2, elementsPerThread); + auto const module_ranges_workdiv = cms::alpakatools::make_workdiv(max_blocks, 256); - alpaka::exec(queue_, + alpaka::exec(queue_, module_ranges_workdiv, ModuleRangesKernel{}, modules_.const_view(), @@ -197,7 +196,7 @@ void LSTEvent::addPixelSegmentToEvent(std::vector const& hitIndice alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); - WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const createMDArrayRangesGPU_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, createMDArrayRangesGPU_workDiv, @@ -228,7 +227,7 @@ void LSTEvent::addPixelSegmentToEvent(std::vector const& hitIndice // can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously. 
// If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them - WorkDiv1D const createSegmentArrayRanges_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const createSegmentArrayRanges_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, createSegmentArrayRanges_workDiv, @@ -309,11 +308,9 @@ void LSTEvent::addPixelSegmentToEvent(std::vector const& hitIndice alpaka::wait(queue_); // FIXME: remove synch after inputs refactored to be in pinned memory - Vec3D const threadsPerBlock{1, 1, 256}; - Vec3D const blocksPerGrid{1, 1, max_blocks}; - WorkDiv3D const addPixelSegmentToEvent_workdiv = createWorkDiv(blocksPerGrid, threadsPerBlock, elementsPerThread); + auto const addPixelSegmentToEvent_workdiv = cms::alpakatools::make_workdiv(max_blocks, 256); - alpaka::exec(queue_, + alpaka::exec(queue_, addPixelSegmentToEvent_workdiv, AddPixelSegmentToEventKernel{}, modules_.const_view(), @@ -343,7 +340,7 @@ void LSTEvent::createMiniDoublets() { alpaka::memcpy(queue_, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); - WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const createMDArrayRangesGPU_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, createMDArrayRangesGPU_workDiv, @@ -372,12 +369,11 @@ void LSTEvent::createMiniDoublets() { alpaka::memset(queue_, totOccupancyMDs_view, 0u); } - Vec3D const threadsPerBlockCreateMD{1, 16, 32}; - Vec3D const blocksPerGridCreateMD{1, nLowerModules_ / threadsPerBlockCreateMD[1], 1}; - WorkDiv3D const createMiniDoublets_workDiv = - createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread); + constexpr int threadsPerBlockY = 16; + auto const createMiniDoublets_workDiv = + cms::alpakatools::make_workdiv({nLowerModules_ / threadsPerBlockY, 1}, {threadsPerBlockY, 32}); - alpaka::exec(queue_, + alpaka::exec(queue_, createMiniDoublets_workDiv, CreateMiniDoublets{}, 
modules_.const_view(), @@ -388,7 +384,7 @@ void LSTEvent::createMiniDoublets() { rangesDC_->const_view(), ptCut_); - WorkDiv1D const addMiniDoubletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const addMiniDoubletRangesToEventExplicit_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, addMiniDoubletRangesToEventExplicit_workDiv, @@ -419,10 +415,7 @@ void LSTEvent::createSegmentsWithModuleMap() { alpaka::memset(queue_, totOccupancySegments_view, 0u); } - Vec3D const threadsPerBlockCreateSeg{1, 1, 64}; - Vec3D const blocksPerGridCreateSeg{1, 1, nLowerModules_}; - WorkDiv3D const createSegments_workDiv = - createWorkDiv(blocksPerGridCreateSeg, threadsPerBlockCreateSeg, elementsPerThread); + auto const createSegments_workDiv = cms::alpakatools::make_workdiv({nLowerModules_, 1, 1}, {1, 1, 64}); alpaka::exec(queue_, createSegments_workDiv, @@ -435,7 +428,7 @@ void LSTEvent::createSegmentsWithModuleMap() { rangesDC_->const_view(), ptCut_); - WorkDiv1D const addSegmentRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const addSegmentRangesToEventExplicit_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, addSegmentRangesToEventExplicit_workDiv, @@ -451,7 +444,7 @@ void LSTEvent::createSegmentsWithModuleMap() { void LSTEvent::createTriplets() { if (!tripletsDC_) { - WorkDiv1D const createTripletArrayRanges_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const createTripletArrayRanges_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, createTripletArrayRanges_workDiv, @@ -528,10 +521,7 @@ void LSTEvent::createTriplets() { auto index_gpu_buf = cms::alpakatools::make_device_buffer(queue_, nLowerModules_); alpaka::memcpy(queue_, index_gpu_buf, index_buf_h, nonZeroModules); - Vec3D const threadsPerBlockCreateTrip{1, 16, 16}; - Vec3D const blocksPerGridCreateTrip{max_blocks, 1, 1}; - WorkDiv3D const createTriplets_workDiv = - createWorkDiv(blocksPerGridCreateTrip, 
threadsPerBlockCreateTrip, elementsPerThread); + auto const createTriplets_workDiv = cms::alpakatools::make_workdiv({max_blocks, 1, 1}, {1, 16, 16}); alpaka::exec(queue_, createTriplets_workDiv, @@ -547,7 +537,7 @@ void LSTEvent::createTriplets() { nonZeroModules, ptCut_); - WorkDiv1D const addTripletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const addTripletRangesToEventExplicit_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, addTripletRangesToEventExplicit_workDiv, @@ -568,12 +558,9 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) alpaka::memset(queue_, buf, 0u); } - Vec3D const threadsPerBlock_crossCleanpT3{1, 16, 64}; - Vec3D const blocksPerGrid_crossCleanpT3{1, 4, 20}; - WorkDiv3D const crossCleanpT3_workDiv = - createWorkDiv(blocksPerGrid_crossCleanpT3, threadsPerBlock_crossCleanpT3, elementsPerThread); + auto const crossCleanpT3_workDiv = cms::alpakatools::make_workdiv({20, 4}, {64, 16}); - alpaka::exec(queue_, + alpaka::exec(queue_, crossCleanpT3_workDiv, CrossCleanpT3{}, modules_.const_view(), @@ -582,7 +569,7 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) segmentsDC_->const_view(), pixelQuintupletsDC_->const_view()); - WorkDiv1D const addpT3asTrackCandidates_workDiv = createWorkDiv({1}, {512}, {1}); + auto const addpT3asTrackCandidates_workDiv = cms::alpakatools::make_workdiv(1, 512); alpaka::exec(queue_, addpT3asTrackCandidates_workDiv, @@ -601,22 +588,21 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) alpaka::wait(queue_); // wait to get the value before using auto const nEligibleModules = *nEligibleModules_buf_h.data(); - Vec3D const threadsPerBlockRemoveDupQuints{1, 16, 32}; - Vec3D const blocksPerGridRemoveDupQuints{1, std::max(nEligibleModules / 16, 1), std::max(nEligibleModules / 32, 1)}; - WorkDiv3D const removeDupQuintupletsBeforeTC_workDiv = - createWorkDiv(blocksPerGridRemoveDupQuints, 
threadsPerBlockRemoveDupQuints, elementsPerThread); + constexpr int threadsPerBlockY = 16; + constexpr int threadsPerBlockX = 32; + auto const removeDupQuintupletsBeforeTC_workDiv = cms::alpakatools::make_workdiv( + {std::max(nEligibleModules / threadsPerBlockY, 1), std::max(nEligibleModules / threadsPerBlockX, 1)}, {threadsPerBlockY, threadsPerBlockX}); - alpaka::exec(queue_, + alpaka::exec(queue_, removeDupQuintupletsBeforeTC_workDiv, RemoveDupQuintupletsBeforeTC{}, quintupletsDC_->view(), quintupletsDC_->view(), rangesDC_->const_view()); - Vec3D const threadsPerBlock_crossCleanT5{32, 1, 32}; - Vec3D const blocksPerGrid_crossCleanT5{(13296 / 32) + 1, 1, max_blocks}; - WorkDiv3D const crossCleanT5_workDiv = - createWorkDiv(blocksPerGrid_crossCleanT5, threadsPerBlock_crossCleanT5, elementsPerThread); + constexpr int threadsPerBlock = 32; + auto const crossCleanT5_workDiv = cms::alpakatools::make_workdiv( + {(nLowerModules_ / threadsPerBlock) + 1, 1, max_blocks}, {threadsPerBlock, 1, threadsPerBlock}); alpaka::exec(queue_, crossCleanT5_workDiv, @@ -628,12 +614,9 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) pixelTripletsDC_->const_view(), rangesDC_->const_view()); - Vec3D const threadsPerBlock_addT5asTrackCandidate{1, 8, 128}; - Vec3D const blocksPerGrid_addT5asTrackCandidate{1, 8, 10}; - WorkDiv3D const addT5asTrackCandidate_workDiv = - createWorkDiv(blocksPerGrid_addT5asTrackCandidate, threadsPerBlock_addT5asTrackCandidate, elementsPerThread); + auto const addT5asTrackCandidate_workDiv = cms::alpakatools::make_workdiv({8, 10}, {8, 128}); - alpaka::exec(queue_, + alpaka::exec(queue_, addT5asTrackCandidate_workDiv, AddT5asTrackCandidate{}, nLowerModules_, @@ -643,12 +626,9 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) rangesDC_->const_view()); if (!no_pls_dupclean) { - Vec3D const threadsPerBlockCheckHitspLS{1, 16, 16}; - Vec3D const blocksPerGridCheckHitspLS{1, max_blocks * 4, max_blocks / 4}; - WorkDiv3D const 
checkHitspLS_workDiv = - createWorkDiv(blocksPerGridCheckHitspLS, threadsPerBlockCheckHitspLS, elementsPerThread); + auto const checkHitspLS_workDiv = cms::alpakatools::make_workdiv({max_blocks * 4, max_blocks / 4}, {16, 16}); - alpaka::exec(queue_, + alpaka::exec(queue_, checkHitspLS_workDiv, CheckHitspLS{}, modules_.const_view(), @@ -657,12 +637,9 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) true); } - Vec3D const threadsPerBlock_crossCleanpLS{1, 16, 32}; - Vec3D const blocksPerGrid_crossCleanpLS{1, 4, 20}; - WorkDiv3D const crossCleanpLS_workDiv = - createWorkDiv(blocksPerGrid_crossCleanpLS, threadsPerBlock_crossCleanpLS, elementsPerThread); + auto const crossCleanpLS_workDiv = cms::alpakatools::make_workdiv({20, 4}, {32, 16}); - alpaka::exec(queue_, + alpaka::exec(queue_, crossCleanpLS_workDiv, CrossCleanpLS{}, modules_.const_view(), @@ -676,12 +653,9 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) hitsDC_->const_view(), quintupletsDC_->const_view()); - Vec3D const threadsPerBlock_addpLSasTrackCandidate{1, 1, 384}; - Vec3D const blocksPerGrid_addpLSasTrackCandidate{1, 1, max_blocks}; - WorkDiv3D const addpLSasTrackCandidate_workDiv = - createWorkDiv(blocksPerGrid_addpLSasTrackCandidate, threadsPerBlock_addpLSasTrackCandidate, elementsPerThread); + auto const addpLSasTrackCandidate_workDiv = cms::alpakatools::make_workdiv(max_blocks, 384); - alpaka::exec(queue_, + alpaka::exec(queue_, addpLSasTrackCandidate_workDiv, AddpLSasTrackCandidate{}, nLowerModules_, @@ -813,9 +787,8 @@ void LSTEvent::createPixelTriplets() { alpaka::memcpy(queue_, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments); alpaka::memcpy(queue_, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments); - Vec3D const threadsPerBlock{1, 4, 32}; - Vec3D const blocksPerGrid{16 /* above median of connected modules*/, 4096, 1}; - WorkDiv3D const createPixelTripletsFromMap_workDiv = 
createWorkDiv(blocksPerGrid, threadsPerBlock, elementsPerThread); + auto const createPixelTripletsFromMap_workDiv = + cms::alpakatools::make_workdiv({16 /* above median of connected modules*/, 4096, 1}, {1, 4, 32}); alpaka::exec(queue_, createPixelTripletsFromMap_workDiv, @@ -845,18 +818,15 @@ void LSTEvent::createPixelTriplets() { #endif //pT3s can be cleaned here because they're not used in making pT5s! - Vec3D const threadsPerBlockDupPixTrip{1, 16, 16}; //seems like more blocks lead to conflicting writes - Vec3D const blocksPerGridDupPixTrip{1, 40, 1}; - WorkDiv3D const removeDupPixelTripletsFromMap_workDiv = - createWorkDiv(blocksPerGridDupPixTrip, threadsPerBlockDupPixTrip, elementsPerThread); + auto const removeDupPixelTripletsFromMap_workDiv = cms::alpakatools::make_workdiv({40, 1}, {16, 16}); - alpaka::exec( + alpaka::exec( queue_, removeDupPixelTripletsFromMap_workDiv, RemoveDupPixelTripletsFromMap{}, pixelTripletsDC_->view()); } void LSTEvent::createQuintuplets() { - WorkDiv1D const createEligibleModulesListForQuintuplets_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const createEligibleModulesListForQuintuplets_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, createEligibleModulesListForQuintuplets_workDiv, @@ -899,10 +869,8 @@ void LSTEvent::createQuintuplets() { alpaka::memset(queue_, partOfPT5_view, 0u); } - Vec3D const threadsPerBlockQuints{1, 8, 32}; - Vec3D const blocksPerGridQuints{std::max((int)nEligibleT5Modules, 1), 1, 1}; - WorkDiv3D const createQuintuplets_workDiv = - createWorkDiv(blocksPerGridQuints, threadsPerBlockQuints, elementsPerThread); + auto const createQuintuplets_workDiv = + cms::alpakatools::make_workdiv({std::max((int)nEligibleT5Modules, 1), 1, 1}, {1, 8, 32}); alpaka::exec(queue_, createQuintuplets_workDiv, @@ -918,10 +886,8 @@ void LSTEvent::createQuintuplets() { nEligibleT5Modules, ptCut_); - Vec3D const threadsPerBlockDupQuint{1, 16, 16}; - Vec3D const blocksPerGridDupQuint{max_blocks, 1, 1}; - 
WorkDiv3D const removeDupQuintupletsAfterBuild_workDiv = - createWorkDiv(blocksPerGridDupQuint, threadsPerBlockDupQuint, elementsPerThread); + auto const removeDupQuintupletsAfterBuild_workDiv = + cms::alpakatools::make_workdiv({max_blocks, 1, 1}, {1, 16, 16}); alpaka::exec(queue_, removeDupQuintupletsAfterBuild_workDiv, @@ -931,7 +897,7 @@ void LSTEvent::createQuintuplets() { quintupletsDC_->const_view(), rangesDC_->const_view()); - WorkDiv1D const addQuintupletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + auto const addQuintupletRangesToEventExplicit_workDiv = cms::alpakatools::make_workdiv(1, 1024); alpaka::exec(queue_, addQuintupletRangesToEventExplicit_workDiv, @@ -947,12 +913,9 @@ void LSTEvent::createQuintuplets() { void LSTEvent::pixelLineSegmentCleaning(bool no_pls_dupclean) { if (!no_pls_dupclean) { - Vec3D const threadsPerBlockCheckHitspLS{1, 16, 16}; - Vec3D const blocksPerGridCheckHitspLS{1, max_blocks * 4, max_blocks / 4}; - WorkDiv3D const checkHitspLS_workDiv = - createWorkDiv(blocksPerGridCheckHitspLS, threadsPerBlockCheckHitspLS, elementsPerThread); + auto const checkHitspLS_workDiv = cms::alpakatools::make_workdiv({max_blocks * 4, max_blocks / 4}, {16, 16}); - alpaka::exec(queue_, + alpaka::exec(queue_, checkHitspLS_workDiv, CheckHitspLS{}, modules_.const_view(), @@ -1057,10 +1020,8 @@ void LSTEvent::createPixelQuintuplets() { alpaka::memcpy(queue_, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments); alpaka::memcpy(queue_, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments); - Vec3D const threadsPerBlockCreatePixQuints{1, 16, 16}; - Vec3D const blocksPerGridCreatePixQuints{16, max_blocks, 1}; - WorkDiv3D const createPixelQuintupletsFromMap_workDiv = - createWorkDiv(blocksPerGridCreatePixQuints, threadsPerBlockCreatePixQuints, elementsPerThread); + auto const createPixelQuintupletsFromMap_workDiv = + cms::alpakatools::make_workdiv({16, max_blocks, 1}, {1, 16, 16}); 
alpaka::exec(queue_, createPixelQuintupletsFromMap_workDiv, @@ -1080,17 +1041,15 @@ void LSTEvent::createPixelQuintuplets() { rangesDC_->const_view(), ptCut_); - Vec3D const threadsPerBlockDupPix{1, 16, 16}; - Vec3D const blocksPerGridDupPix{1, max_blocks, 1}; - WorkDiv3D const removeDupPixelQuintupletsFromMap_workDiv = - createWorkDiv(blocksPerGridDupPix, threadsPerBlockDupPix, elementsPerThread); + auto const removeDupPixelQuintupletsFromMap_workDiv = + cms::alpakatools::make_workdiv({max_blocks, 1}, {16, 16}); - alpaka::exec(queue_, + alpaka::exec(queue_, removeDupPixelQuintupletsFromMap_workDiv, RemoveDupPixelQuintupletsFromMap{}, pixelQuintupletsDC_->view()); - WorkDiv1D const addpT5asTrackCandidate_workDiv = createWorkDiv({1}, {256}, {1}); + auto const addpT5asTrackCandidate_workDiv = cms::alpakatools::make_workdiv(1, 256); alpaka::exec(queue_, addpT5asTrackCandidate_workDiv, diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.h b/RecoTracker/LSTCore/src/alpaka/LSTEvent.h index a883436a11266..7b36c011265a8 100644 --- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.h +++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.h @@ -15,6 +15,7 @@ #include "RecoTracker/LSTCore/interface/ModulesHostCollection.h" #include "RecoTracker/LSTCore/interface/alpaka/Common.h" #include "RecoTracker/LSTCore/interface/alpaka/LST.h" +#include "RecoTracker/LSTCore/interface/alpaka/HitsDeviceCollection.h" #include "RecoTracker/LSTCore/interface/alpaka/MiniDoubletsDeviceCollection.h" #include "RecoTracker/LSTCore/interface/alpaka/PixelQuintupletsDeviceCollection.h" #include "RecoTracker/LSTCore/interface/alpaka/PixelTripletsDeviceCollection.h" @@ -26,9 +27,6 @@ #include "RecoTracker/LSTCore/interface/alpaka/ObjectRangesDeviceCollection.h" #include "RecoTracker/LSTCore/interface/alpaka/EndcapGeometryDevDeviceCollection.h" -#include "Hit.h" -#include "Kernels.h" - #include "HeterogeneousCore/AlpakaInterface/interface/host.h" namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { diff --git 
a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h index 4255d651dbce2..cb9d3e50e955b 100644 --- a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h +++ b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h @@ -2,6 +2,7 @@ #define RecoTracker_LSTCore_src_alpaka_MiniDoublet_h #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "FWCore/Utilities/interface/isFinite.h" #include "RecoTracker/LSTCore/interface/alpaka/Common.h" #include "RecoTracker/LSTCore/interface/MiniDoubletsSoA.h" @@ -299,7 +300,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { drprime = (moduleSeparation / alpaka::math::sin(acc, angleA + angleB)) * alpaka::math::sin(acc, angleA); // Compute arctan of the slope and take care of the slope = infinity case - absArctanSlope = ((slope != kVerticalModuleSlope) ? fabs(alpaka::math::atan(acc, slope)) : kPi / 2.f); + absArctanSlope = + ((slope != kVerticalModuleSlope && edm::isFinite(slope)) ? fabs(alpaka::math::atan(acc, slope)) : kPi / 2.f); // Depending on which quadrant the pixel hit lies, we define the angleM by shifting them slightly differently if (xp > 0 and yp > 0) { @@ -322,8 +324,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { ya = yp + drprime_y; // Compute the new strip hit position (if the slope value is in special condition take care of the exceptions) - if (slope == - kVerticalModuleSlope) // Designated for tilted module when the slope is infinity (module lying along y-axis) + if (slope == kVerticalModuleSlope || + edm::isNotFinite(slope)) // Designated for tilted module when the slope is infinity (module lying along y-axis) { xn = xa; // New x point is simply where the anchor is yn = yo; // No shift in y @@ -673,8 +675,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CreateMiniDoublets { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc2D const& acc, ModulesConst modules, HitsConst hits, HitsRangesConst hitsRanges, @@ 
-682,11 +683,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { MiniDoubletsOccupancy mdsOccupancy, ObjectRangesConst ranges, const float ptCut) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (uint16_t lowerModuleIndex = globalThreadIdx[1]; lowerModuleIndex < modules.nLowerModules(); - lowerModuleIndex += gridThreadExtent[1]) { + for (uint16_t lowerModuleIndex : cms::alpakatools::uniform_elements_y(acc, modules.nLowerModules())) { uint16_t upperModuleIndex = modules.partnerModuleIndices()[lowerModuleIndex]; int nLowerHits = hitsRanges.hitRangesnLower()[lowerModuleIndex]; int nUpperHits = hitsRanges.hitRangesnUpper()[lowerModuleIndex]; @@ -696,7 +693,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int loHitArrayIndex = hitsRanges.hitRangesLower()[lowerModuleIndex]; int limit = nUpperHits * nLowerHits; - for (int hitIndex = globalThreadIdx[2]; hitIndex < limit; hitIndex += gridThreadExtent[2]) { + for (int hitIndex : cms::alpakatools::uniform_elements_x(acc, limit)) { int lowerHitIndex = hitIndex / nUpperHits; int upperHitIndex = hitIndex % nUpperHits; if (upperHitIndex >= nUpperHits) @@ -804,15 +801,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CreateMDArrayRangesGPU { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, ModulesConst modules, ObjectRanges ranges, const float ptCut) const { + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, ObjectRanges ranges, const float ptCut) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - // Declare variables in shared memory and set to 0 int& nTotalMDs = alpaka::declareSharedVar(acc); if (cms::alpakatools::once_per_block(acc)) { @@ -839,7 +831,7 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE::lst { // Select the appropriate occupancy matrix based on ptCut const auto& occupancy_matrix = (ptCut < 0.8f) ? p06_occupancy_matrix : p08_occupancy_matrix; - for (uint16_t i = globalThreadIdx[0]; i < modules.nLowerModules(); i += gridThreadExtent[0]) { + for (uint16_t i : cms::alpakatools::uniform_elements(acc, modules.nLowerModules())) { short module_rings = modules.rings()[i]; short module_layers = modules.layers()[i]; short module_subdets = modules.subdets()[i]; @@ -874,20 +866,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddMiniDoubletRangesToEventExplicit { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, MiniDoubletsOccupancy mdsOccupancy, ObjectRanges ranges, HitsRangesConst hitsRanges) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (uint16_t i = globalThreadIdx[0]; i < modules.nLowerModules(); i += gridThreadExtent[0]) { + for (uint16_t i : cms::alpakatools::uniform_elements(acc, modules.nLowerModules())) { if (mdsOccupancy.nMDs()[i] == 0 or hitsRanges.hitRanges()[i][0] == -1) { ranges.mdRanges()[i][0] = -1; ranges.mdRanges()[i][1] = -1; diff --git a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h index 08b7d7d414bf7..8408fb9ca0bd8 100644 --- a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h @@ -634,8 +634,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CreatePixelQuintupletsFromMap { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, ModulesPixelConst modulesPixel, MiniDoubletsConst mds, @@ -650,15 +649,10 
@@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int nPixelSegments, ObjectRangesConst ranges, const float ptCut) const { - auto const globalBlockIdx = alpaka::getIdx(acc); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridBlockExtent = alpaka::getWorkDiv(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (unsigned int i_pLS = globalThreadIdx[1]; i_pLS < nPixelSegments; i_pLS += gridThreadExtent[1]) { + for (unsigned int i_pLS : cms::alpakatools::uniform_elements_y(acc, nPixelSegments)) { auto iLSModule_max = connectedPixelIndex[i_pLS] + connectedPixelSize[i_pLS]; - for (unsigned int iLSModule = connectedPixelIndex[i_pLS] + globalBlockIdx[0]; iLSModule < iLSModule_max; - iLSModule += gridBlockExtent[0]) { + for (unsigned int iLSModule : + cms::alpakatools::uniform_elements_z(acc, connectedPixelIndex[i_pLS], iLSModule_max)) { //these are actual module indices uint16_t quintupletLowerModuleIndex = modulesPixel.connectedPixels()[iLSModule]; if (quintupletLowerModuleIndex >= modules.nLowerModules()) @@ -676,9 +670,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int pixelSegmentIndex = ranges.segmentModuleIndices()[pixelModuleIndex] + i_pLS; //fetch the quintuplet - for (unsigned int outerQuintupletArrayIndex = globalThreadIdx[2]; - outerQuintupletArrayIndex < nOuterQuintuplets; - outerQuintupletArrayIndex += gridThreadExtent[2]) { + for (unsigned int outerQuintupletArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterQuintuplets)) { unsigned int quintupletIndex = ranges.quintupletModuleIndices()[quintupletLowerModuleIndex] + outerQuintupletArrayIndex; diff --git a/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h index cfd1b3d9b5a5c..c0e132c6ddb5d 100644 --- a/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h +++ b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h @@ -786,8 +786,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CreatePixelTripletsFromMap { 
- template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, ModulesPixelConst modulesPixel, ObjectRangesConst ranges, @@ -801,16 +800,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int* connectedPixelIndex, unsigned int nPixelSegments, const float ptCut) const { - auto const globalBlockIdx = alpaka::getIdx(acc); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridBlockExtent = alpaka::getWorkDiv(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (unsigned int i_pLS = globalThreadIdx[1]; i_pLS < nPixelSegments; i_pLS += gridThreadExtent[1]) { + for (unsigned int i_pLS : cms::alpakatools::uniform_elements_y(acc, nPixelSegments)) { auto iLSModule_max = connectedPixelIndex[i_pLS] + connectedPixelSize[i_pLS]; - for (unsigned int iLSModule = connectedPixelIndex[i_pLS] + globalBlockIdx[0]; iLSModule < iLSModule_max; - iLSModule += gridBlockExtent[0]) { + for (unsigned int iLSModule : + cms::alpakatools::uniform_elements_z(acc, connectedPixelIndex[i_pLS], iLSModule_max)) { uint16_t tripletLowerModuleIndex = modulesPixel.connectedPixels() [iLSModule]; //connected pixels will have the appropriate lower module index by default! 
@@ -850,8 +844,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } //fetch the triplet - for (unsigned int outerTripletArrayIndex = globalThreadIdx[2]; outerTripletArrayIndex < nOuterTriplets; - outerTripletArrayIndex += gridThreadExtent[2]) { + for (unsigned int outerTripletArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterTriplets)) { unsigned int outerTripletIndex = ranges.tripletModuleIndices()[tripletLowerModuleIndex] + outerTripletArrayIndex; if (modules.moduleType()[triplets.lowerModuleIndices()[outerTripletIndex][1]] == TwoS) diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h index 8bb9a16a5a38b..ef14d8fd92a8e 100644 --- a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h @@ -657,8 +657,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { // Computing sigmas is a very tricky affair // if the module is tilted or endcap, we need to use the slopes properly! - absArctanSlope = ((slopes[i] != kVerticalModuleSlope) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) - : kPi / 2.f); + absArctanSlope = ((slopes[i] != kVerticalModuleSlope && edm::isFinite(slopes[i])) + ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) + : kPi / 2.f); if (xs[i] > 0 and ys[i] > 0) { angleM = kPi / 2.f - absArctanSlope; @@ -740,8 +741,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { float chiSquared = 0.f; float absArctanSlope, angleM, xPrime, yPrime, sigma2; for (size_t i = 0; i < nPoints; i++) { - absArctanSlope = ((slopes[i] != kVerticalModuleSlope) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) - : kPi / 2.f); + absArctanSlope = ((slopes[i] != kVerticalModuleSlope && edm::isFinite(slopes[i])) + ? 
alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) + : kPi / 2.f); if (xs[i] > 0 and ys[i] > 0) { angleM = kPi / 2.f - absArctanSlope; } else if (xs[i] < 0 and ys[i] > 0) { @@ -1640,8 +1642,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CreateQuintuplets { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, MiniDoubletsConst mds, SegmentsConst segments, @@ -1652,10 +1653,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { ObjectRangesConst ranges, uint16_t nEligibleT5Modules, const float ptCut) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (int iter = globalThreadIdx[0]; iter < nEligibleT5Modules; iter += gridThreadExtent[0]) { + for (int iter : cms::alpakatools::uniform_elements_z(acc, nEligibleT5Modules)) { uint16_t lowerModule1 = ranges.indicesOfEligibleT5Modules()[iter]; short layer2_adjustment; int layer = modules.layers()[lowerModule1]; @@ -1669,14 +1667,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { continue; } unsigned int nInnerTriplets = tripletsOccupancy.nTriplets()[lowerModule1]; - for (unsigned int innerTripletArrayIndex = globalThreadIdx[1]; innerTripletArrayIndex < nInnerTriplets; - innerTripletArrayIndex += gridThreadExtent[1]) { + for (unsigned int innerTripletArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerTriplets)) { unsigned int innerTripletIndex = ranges.tripletModuleIndices()[lowerModule1] + innerTripletArrayIndex; uint16_t lowerModule2 = triplets.lowerModuleIndices()[innerTripletIndex][1]; uint16_t lowerModule3 = triplets.lowerModuleIndices()[innerTripletIndex][2]; unsigned int nOuterTriplets = tripletsOccupancy.nTriplets()[lowerModule3]; - for (unsigned int outerTripletArrayIndex = globalThreadIdx[2]; outerTripletArrayIndex < nOuterTriplets; - outerTripletArrayIndex += gridThreadExtent[2]) { + for (unsigned int outerTripletArrayIndex : 
cms::alpakatools::uniform_elements_x(acc, nOuterTriplets)) { unsigned int outerTripletIndex = ranges.tripletModuleIndices()[lowerModule3] + outerTripletArrayIndex; uint16_t lowerModule4 = triplets.lowerModuleIndices()[outerTripletIndex][1]; uint16_t lowerModule5 = triplets.lowerModuleIndices()[outerTripletIndex][2]; @@ -1776,19 +1772,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct CreateEligibleModulesListForQuintuplets { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, TripletsOccupancyConst tripletsOccupancy, ObjectRanges ranges, const float ptCut) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - // Initialize variables in shared memory and set to 0 int& nEligibleT5Modulesx = alpaka::declareSharedVar(acc); int& nTotalQuintupletsx = alpaka::declareSharedVar(acc); @@ -1817,7 +1808,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { // Select the appropriate occupancy matrix based on ptCut const auto& occupancy_matrix = (ptCut < 0.8f) ? 
p06_occupancy_matrix : p08_occupancy_matrix; - for (int i = globalThreadIdx[0]; i < modules.nLowerModules(); i += gridThreadExtent[0]) { + for (int i : cms::alpakatools::uniform_elements(acc, modules.nLowerModules())) { // Condition for a quintuple to exist for a module // TCs don't exist for layers 5 and 6 barrel, and layers 2,3,4,5 endcap short module_rings = modules.rings()[i]; @@ -1863,19 +1854,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddQuintupletRangesToEventExplicit { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, QuintupletsOccupancyConst quintupletsOccupancy, ObjectRanges ranges) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (uint16_t i = globalThreadIdx[0]; i < modules.nLowerModules(); i += gridThreadExtent[0]) { + for (uint16_t i : cms::alpakatools::uniform_elements(acc, modules.nLowerModules())) { if (quintupletsOccupancy.nQuintuplets()[i] == 0 or ranges.quintupletModuleIndices()[i] == -1) { ranges.quintupletRanges()[i][0] = -1; ranges.quintupletRanges()[i][1] = -1; diff --git a/RecoTracker/LSTCore/src/alpaka/Segment.h b/RecoTracker/LSTCore/src/alpaka/Segment.h index 911119bf67ff8..3309100529900 100644 --- a/RecoTracker/LSTCore/src/alpaka/Segment.h +++ b/RecoTracker/LSTCore/src/alpaka/Segment.h @@ -1,6 +1,8 @@ #ifndef RecoTracker_LSTCore_src_alpaka_Segment_h #define RecoTracker_LSTCore_src_alpaka_Segment_h +#include + #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoTracker/LSTCore/interface/alpaka/Common.h" @@ -246,7 +248,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { mds.anchorY()[innerMDIndex] + circleRadius * alpaka::math::cos(acc, circlePhi)}; //check which of the circles can 
accommodate r3LH better (we won't get perfect agreement) - float bestChiSquared = kVerticalModuleSlope; + float bestChiSquared = std::numeric_limits::infinity(); float chiSquared; size_t bestIndex; for (size_t i = 0; i < 2; i++) { @@ -523,8 +525,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CreateSegments { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, MiniDoubletsConst mds, MiniDoubletsOccupancyConst mdsOccupancy, @@ -532,21 +533,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { SegmentsOccupancy segmentsOccupancy, ObjectRangesConst ranges, const float ptCut) const { - auto const globalBlockIdx = alpaka::getIdx(acc); - auto const blockThreadIdx = alpaka::getIdx(acc); - auto const gridBlockExtent = alpaka::getWorkDiv(acc); - auto const blockThreadExtent = alpaka::getWorkDiv(acc); - - for (uint16_t innerLowerModuleIndex = globalBlockIdx[2]; innerLowerModuleIndex < modules.nLowerModules(); - innerLowerModuleIndex += gridBlockExtent[2]) { + for (uint16_t innerLowerModuleIndex : cms::alpakatools::uniform_elements_z(acc, modules.nLowerModules())) { unsigned int nInnerMDs = mdsOccupancy.nMDs()[innerLowerModuleIndex]; if (nInnerMDs == 0) continue; unsigned int nConnectedModules = modules.nConnectedModules()[innerLowerModuleIndex]; - for (uint16_t outerLowerModuleArrayIdx = blockThreadIdx[1]; outerLowerModuleArrayIdx < nConnectedModules; - outerLowerModuleArrayIdx += blockThreadExtent[1]) { + for (uint16_t outerLowerModuleArrayIdx : cms::alpakatools::uniform_elements_y(acc, nConnectedModules)) { uint16_t outerLowerModuleIndex = modules.moduleMap()[innerLowerModuleIndex][outerLowerModuleArrayIdx]; unsigned int nOuterMDs = mdsOccupancy.nMDs()[outerLowerModuleIndex]; @@ -555,7 +549,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { if (limit == 0) continue; - for (unsigned int hitIndex = blockThreadIdx[2]; hitIndex < limit; hitIndex += blockThreadExtent[2]) { + for (unsigned 
int hitIndex : cms::alpakatools::uniform_elements_x(acc, limit)) { unsigned int innerMDArrayIdx = hitIndex / nOuterMDs; unsigned int outerMDArrayIdx = hitIndex % nOuterMDs; if (outerMDArrayIdx >= nOuterMDs) @@ -625,16 +619,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct CreateSegmentArrayRanges { - template ALPAKA_FN_ACC void operator()( - TAcc const& acc, ModulesConst modules, ObjectRanges ranges, MiniDoubletsConst mds, const float ptCut) const { + Acc1D const& acc, ModulesConst modules, ObjectRanges ranges, MiniDoubletsConst mds, const float ptCut) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - // Initialize variables in shared memory and set to 0 int& nTotalSegments = alpaka::declareSharedVar(acc); if (cms::alpakatools::once_per_block(acc)) { @@ -661,7 +650,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { // Select the appropriate occupancy matrix based on ptCut const auto& occupancy_matrix = (ptCut < 0.8f) ? 
p06_occupancy_matrix : p08_occupancy_matrix; - for (uint16_t i = globalThreadIdx[0]; i < modules.nLowerModules(); i += gridThreadExtent[0]) { + for (uint16_t i : cms::alpakatools::uniform_elements(acc, modules.nLowerModules())) { if (modules.nConnectedModules()[i] == 0) { ranges.segmentModuleIndices()[i] = nTotalSegments; ranges.segmentModuleOccupancy()[i] = 0; @@ -701,19 +690,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddSegmentRangesToEventExplicit { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, SegmentsOccupancyConst segmentsOccupancy, ObjectRanges ranges) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (uint16_t i = globalThreadIdx[0]; i < modules.nLowerModules(); i += gridThreadExtent[0]) { + for (uint16_t i : cms::alpakatools::uniform_elements(acc, modules.nLowerModules())) { if (segmentsOccupancy.nSegments()[i] == 0) { ranges.segmentRanges()[i][0] = -1; ranges.segmentRanges()[i][1] = -1; @@ -726,8 +710,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddPixelSegmentToEventKernel { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, ObjectRangesConst ranges, HitsConst hits, @@ -741,10 +724,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { float* dPhiChange, uint16_t pixelModuleIndex, int size) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (int tid = globalThreadIdx[2]; tid < size; tid += gridThreadExtent[2]) { + for (int tid : cms::alpakatools::uniform_elements(acc, size)) { unsigned int innerMDIndex = ranges.miniDoubletModuleIndices()[pixelModuleIndex] + 2 * 
(tid); unsigned int outerMDIndex = ranges.miniDoubletModuleIndices()[pixelModuleIndex] + 2 * (tid) + 1; unsigned int pixelSegmentIndex = ranges.segmentModuleIndices()[pixelModuleIndex] + tid; diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h index 315263919aa87..7f016af17199c 100644 --- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h +++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h @@ -1,6 +1,8 @@ #ifndef RecoTracker_LSTCore_src_alpaka_TrackCandidate_h #define RecoTracker_LSTCore_src_alpaka_TrackCandidate_h +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + #include "RecoTracker/LSTCore/interface/alpaka/Common.h" #include "RecoTracker/LSTCore/interface/ModulesSoA.h" #include "RecoTracker/LSTCore/interface/MiniDoubletsSoA.h" @@ -106,19 +108,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CrossCleanpT3 { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc2D const& acc, ModulesConst modules, ObjectRangesConst ranges, PixelTriplets pixelTriplets, SegmentsPixelConst segmentsPixel, PixelQuintupletsConst pixelQuintuplets) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - unsigned int nPixelTriplets = pixelTriplets.nPixelTriplets(); - for (unsigned int pixelTripletIndex = globalThreadIdx[2]; pixelTripletIndex < nPixelTriplets; - pixelTripletIndex += gridThreadExtent[2]) { + for (unsigned int pixelTripletIndex : cms::alpakatools::uniform_elements_y(acc, nPixelTriplets)) { if (pixelTriplets.isDup()[pixelTripletIndex]) continue; @@ -130,8 +127,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int prefix = ranges.segmentModuleIndices()[pixelModuleIndex]; unsigned int nPixelQuintuplets = pixelQuintuplets.nPixelQuintuplets(); - for (unsigned int pixelQuintupletIndex = globalThreadIdx[1]; pixelQuintupletIndex < nPixelQuintuplets; - pixelQuintupletIndex += 
gridThreadExtent[1]) { + for (unsigned int pixelQuintupletIndex : cms::alpakatools::uniform_elements_x(acc, nPixelQuintuplets)) { unsigned int pLS_jx = pixelQuintuplets.pixelSegmentIndices()[pixelQuintupletIndex]; float eta2 = segmentsPixel.eta()[pLS_jx - prefix]; float phi2 = segmentsPixel.phi()[pLS_jx - prefix]; @@ -147,26 +143,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct CrossCleanT5 { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, Quintuplets quintuplets, QuintupletsOccupancyConst quintupletsOccupancy, PixelQuintupletsConst pixelQuintuplets, PixelTripletsConst pixelTriplets, ObjectRangesConst ranges) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (int innerInnerInnerLowerModuleArrayIndex = globalThreadIdx[0]; - innerInnerInnerLowerModuleArrayIndex < modules.nLowerModules(); - innerInnerInnerLowerModuleArrayIndex += gridThreadExtent[0]) { + for (int innerInnerInnerLowerModuleArrayIndex : + cms::alpakatools::uniform_elements_z(acc, modules.nLowerModules())) { if (ranges.quintupletModuleIndices()[innerInnerInnerLowerModuleArrayIndex] == -1) continue; unsigned int nQuints = quintupletsOccupancy.nQuintuplets()[innerInnerInnerLowerModuleArrayIndex]; - for (unsigned int innerObjectArrayIndex = globalThreadIdx[1]; innerObjectArrayIndex < nQuints; - innerObjectArrayIndex += gridThreadExtent[1]) { + for (unsigned int innerObjectArrayIndex : cms::alpakatools::uniform_elements_y(acc, nQuints)) { unsigned int quintupletIndex = ranges.quintupletModuleIndices()[innerInnerInnerLowerModuleArrayIndex] + innerObjectArrayIndex; @@ -178,7 +168,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { float eta1 = __H2F(quintuplets.eta()[quintupletIndex]); float phi1 = __H2F(quintuplets.phi()[quintupletIndex]); - for (unsigned int jx = globalThreadIdx[2]; jx < loop_bound; jx += gridThreadExtent[2]) { + for 
(unsigned int jx : cms::alpakatools::uniform_elements_x(acc, loop_bound)) { float eta2, phi2; if (jx < pixelQuintuplets.nPixelQuintuplets()) { eta2 = __H2F(pixelQuintuplets.eta()[jx]); @@ -201,8 +191,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct CrossCleanpLS { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc2D const& acc, ModulesConst modules, ObjectRangesConst ranges, PixelTripletsConst pixelTriplets, @@ -213,13 +202,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { MiniDoubletsConst mds, HitsConst hits, QuintupletsConst quintuplets) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - int pixelModuleIndex = modules.nLowerModules(); unsigned int nPixels = segmentsOccupancy.nSegments()[pixelModuleIndex]; - for (unsigned int pixelArrayIndex = globalThreadIdx[2]; pixelArrayIndex < nPixels; - pixelArrayIndex += gridThreadExtent[2]) { + for (unsigned int pixelArrayIndex : cms::alpakatools::uniform_elements_y(acc, nPixels)) { if (!segmentsPixel.isQuad()[pixelArrayIndex] || segmentsPixel.isDup()[pixelArrayIndex]) continue; @@ -228,8 +213,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int prefix = ranges.segmentModuleIndices()[pixelModuleIndex]; unsigned int nTrackCandidates = cands.nTrackCandidates(); - for (unsigned int trackCandidateIndex = globalThreadIdx[1]; trackCandidateIndex < nTrackCandidates; - trackCandidateIndex += gridThreadExtent[1]) { + for (unsigned int trackCandidateIndex : cms::alpakatools::uniform_elements_x(acc, nTrackCandidates)) { short type = cands.trackCandidateType()[trackCandidateIndex]; unsigned int innerTrackletIdx = cands.objectIndices()[trackCandidateIndex][0]; if (type == LSTObjType::T5) { @@ -281,24 +265,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddpT3asTrackCandidates { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, 
uint16_t nLowerModules, PixelTripletsConst pixelTriplets, TrackCandidates cands, SegmentsPixelConst segmentsPixel, ObjectRangesConst ranges) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - unsigned int nPixelTriplets = pixelTriplets.nPixelTriplets(); unsigned int pLS_offset = ranges.segmentModuleIndices()[nLowerModules]; - for (unsigned int pixelTripletIndex = globalThreadIdx[0]; pixelTripletIndex < nPixelTriplets; - pixelTripletIndex += gridThreadExtent[0]) { + for (unsigned int pixelTripletIndex : cms::alpakatools::uniform_elements(acc, nPixelTriplets)) { if ((pixelTriplets.isDup()[pixelTripletIndex])) continue; @@ -337,22 +315,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddT5asTrackCandidate { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc2D const& acc, uint16_t nLowerModules, QuintupletsConst quintuplets, QuintupletsOccupancyConst quintupletsOccupancy, TrackCandidates cands, ObjectRangesConst ranges) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (int idx = globalThreadIdx[1]; idx < nLowerModules; idx += gridThreadExtent[1]) { + for (int idx : cms::alpakatools::uniform_elements_y(acc, nLowerModules)) { if (ranges.quintupletModuleIndices()[idx] == -1) continue; unsigned int nQuints = quintupletsOccupancy.nQuintuplets()[idx]; - for (unsigned int jdx = globalThreadIdx[2]; jdx < nQuints; jdx += gridThreadExtent[2]) { + for (unsigned int jdx : cms::alpakatools::uniform_elements_x(acc, nQuints)) { unsigned int quintupletIndex = ranges.quintupletModuleIndices()[idx] + jdx; if (quintuplets.isDup()[quintupletIndex] or quintuplets.partOfPT5()[quintupletIndex]) continue; @@ -391,19 +365,14 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddpLSasTrackCandidate { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint16_t nLowerModules, TrackCandidates cands, SegmentsOccupancyConst segmentsOccupancy, SegmentsPixelConst segmentsPixel, bool tc_pls_triplets) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - unsigned int nPixels = segmentsOccupancy.nSegments()[nLowerModules]; - for (unsigned int pixelArrayIndex = globalThreadIdx[2]; pixelArrayIndex < nPixels; - pixelArrayIndex += gridThreadExtent[2]) { + for (unsigned int pixelArrayIndex : cms::alpakatools::uniform_elements(acc, nPixels)) { if ((tc_pls_triplets ? 0 : !segmentsPixel.isQuad()[pixelArrayIndex]) || (segmentsPixel.isDup()[pixelArrayIndex])) continue; @@ -432,24 +401,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct AddpT5asTrackCandidate { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint16_t nLowerModules, PixelQuintupletsConst pixelQuintuplets, TrackCandidates cands, SegmentsPixelConst segmentsPixel, ObjectRangesConst ranges) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - int nPixelQuintuplets = pixelQuintuplets.nPixelQuintuplets(); unsigned int pLS_offset = ranges.segmentModuleIndices()[nLowerModules]; - for (int pixelQuintupletIndex = globalThreadIdx[0]; pixelQuintupletIndex < nPixelQuintuplets; - pixelQuintupletIndex += gridThreadExtent[0]) { + for (int pixelQuintupletIndex : cms::alpakatools::uniform_elements(acc, nPixelQuintuplets)) { if (pixelQuintuplets.isDup()[pixelQuintupletIndex]) continue; diff --git a/RecoTracker/LSTCore/src/alpaka/Triplet.h 
b/RecoTracker/LSTCore/src/alpaka/Triplet.h index ae2faecb080a6..e84fa06a2372c 100644 --- a/RecoTracker/LSTCore/src/alpaka/Triplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Triplet.h @@ -734,8 +734,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } struct CreateTriplets { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc3D const& acc, ModulesConst modules, MiniDoubletsConst mds, SegmentsConst segments, @@ -746,11 +745,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { uint16_t* index_gpu, uint16_t nonZeroModules, const float ptCut) const { - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - - for (uint16_t innerLowerModuleArrayIdx = globalThreadIdx[0]; innerLowerModuleArrayIdx < nonZeroModules; - innerLowerModuleArrayIdx += gridThreadExtent[0]) { + for (uint16_t innerLowerModuleArrayIdx : cms::alpakatools::uniform_elements_z(acc, nonZeroModules)) { uint16_t innerInnerLowerModuleIndex = index_gpu[innerLowerModuleArrayIdx]; if (innerInnerLowerModuleIndex >= modules.nLowerModules()) continue; @@ -760,8 +755,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { continue; unsigned int nInnerSegments = segmentsOccupancy.nSegments()[innerInnerLowerModuleIndex]; - for (unsigned int innerSegmentArrayIndex = globalThreadIdx[1]; innerSegmentArrayIndex < nInnerSegments; - innerSegmentArrayIndex += gridThreadExtent[1]) { + for (unsigned int innerSegmentArrayIndex : cms::alpakatools::uniform_elements_y(acc, nInnerSegments)) { unsigned int innerSegmentIndex = ranges.segmentRanges()[innerInnerLowerModuleIndex][0] + innerSegmentArrayIndex; @@ -769,8 +763,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { uint16_t middleLowerModuleIndex = segments.outerLowerModuleIndices()[innerSegmentIndex]; unsigned int nOuterSegments = segmentsOccupancy.nSegments()[middleLowerModuleIndex]; - for (unsigned int outerSegmentArrayIndex = globalThreadIdx[2]; outerSegmentArrayIndex < nOuterSegments; - 
outerSegmentArrayIndex += gridThreadExtent[2]) { + for (unsigned int outerSegmentArrayIndex : cms::alpakatools::uniform_elements_x(acc, nOuterSegments)) { unsigned int outerSegmentIndex = ranges.segmentRanges()[middleLowerModuleIndex][0] + outerSegmentArrayIndex; uint16_t outerOuterLowerModuleIndex = segments.outerLowerModuleIndices()[outerSegmentIndex]; @@ -841,19 +834,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { }; struct CreateTripletArrayRanges { - template - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, ModulesConst modules, ObjectRanges ranges, SegmentsOccupancyConst segmentsOccupancy, const float ptCut) const { // implementation is 1D with a single block - static_assert(std::is_same_v, "Should be Acc1D"); ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); - auto const globalThreadIdx = alpaka::getIdx(acc); - auto const gridThreadExtent = alpaka::getWorkDiv(acc); - // Initialize variables in shared memory and set to 0 int& nTotalTriplets = alpaka::declareSharedVar(acc); if (cms::alpakatools::once_per_block(acc)) { @@ -880,7 +868,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { // Select the appropriate occupancy matrix based on ptCut const auto& occupancy_matrix = (ptCut < 0.8f) ? 
// Copied from Triplet.h (host-side version: takes no accelerator argument).
//
// Computes the circle passing through three anchor hits, written as
//   x^2 + y^2 - 2*g*x - 2*f*y + c = 0,
// i.e. centre (g, f) and radius sqrt(g^2 + f^2 - c).
//
//   (x1, y1), (x2, y2), (x3, y3) : first, second and third anchor hit
//   g, f                         : outputs, x and y coordinate of the circle centre
//
// Returns the radius, or -1 when no circle can be determined: the three points
// are collinear, r^2 comes out negative, or r^2 is NaN (non-finite input or
// overflow in the intermediate products). g and f are always written, but on
// failure they may hold non-finite values and must not be used.
float computeRadiusFromThreeAnchorHits(float x1, float y1, float x2, float y2, float x3, float y3, float& g, float& f) {
  // Twice the signed area of the triangle (up to sign); zero means the hits are collinear.
  const float denom = (y1 - y3) * (x2 - x3) - (x1 - x3) * (y2 - y3);
  const float denomInv = 1.0f / denom;

  const float xy1sqr = x1 * x1 + y1 * y1;
  const float xy2sqr = x2 * x2 + y2 * y2;
  const float xy3sqr = x3 * x3 + y3 * y3;

  g = 0.5f * ((y3 - y2) * xy1sqr + (y1 - y3) * xy2sqr + (y2 - y1) * xy3sqr) * denomInv;
  f = 0.5f * ((x2 - x3) * xy1sqr + (x3 - x1) * xy2sqr + (x1 - x2) * xy3sqr) * denomInv;

  const float c =
      ((x2 * y3 - x3 * y2) * xy1sqr + (x3 * y1 - x1 * y3) * xy2sqr + (x1 * y2 - x2 * y1) * xy3sqr) * denomInv;

  const float radiusSquared = g * g + f * f - c;

  // `!(r2 >= 0)` instead of `r2 < 0`: every comparison with NaN is false, so the
  // negated form is the one that also rejects a NaN radius.
  if (denom == 0.f || !(radiusSquared >= 0.f)) {
#ifdef WARNINGS
    printf("three collinear points or FATAL! r^2 < 0!\n");
#endif
    return -1.f;
  }

  return sqrt(radiusSquared);
}