From 9d34af4e91f90bc327ec8c9b985a195037cb6c39 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 1 Oct 2024 19:57:30 +0200 Subject: [PATCH] Improve scheduling, benchmarking, and config --- .../GPU/DeviceInterface/GPUInterface.h | 1 + Common/DCAFitter/GPU/cuda/CMakeLists.txt | 2 +- Common/DCAFitter/GPU/cuda/DCAFitterN.cu | 72 ++++++++---- Common/DCAFitter/GPU/cuda/GPUInterface.cu | 7 +- .../GPU/cuda/test/testDCAFitterNGPU.cxx | 106 +++++++++--------- Common/DCAFitter/GPU/hip/CMakeLists.txt | 2 +- 6 files changed, 113 insertions(+), 77 deletions(-) diff --git a/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h b/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h index effd13021d538..3aa5ead805acd 100644 --- a/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h +++ b/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h @@ -17,6 +17,7 @@ #include #include +#include namespace o2 { diff --git a/Common/DCAFitter/GPU/cuda/CMakeLists.txt b/Common/DCAFitter/GPU/cuda/CMakeLists.txt index 42c2b8f0d5059..ddc1d09445d7f 100644 --- a/Common/DCAFitter/GPU/cuda/CMakeLists.txt +++ b/Common/DCAFitter/GPU/cuda/CMakeLists.txt @@ -20,7 +20,7 @@ o2_add_library(DCAFitterCUDA O2::DetectorsBase PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider) set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) -add_compile_options(-lineinfo) +# add_compile_options(-lineinfo) o2_add_test(DCAFitterNCUDA SOURCES test/testDCAFitterNGPU.cxx diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu index 28a969f25dee3..ab53ae25d7548 100644 --- a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu +++ b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu @@ -15,6 +15,8 @@ #include #endif +#include + #include "GPUCommonDef.h" #include "DCAFitter/DCAFitterN.h" #include "DeviceInterface/GPUInterface.h" @@ -133,23 +135,25 @@ int process(const int nBlocks, template void processBulk(const int nBlocks, const int nThreads, - const int nStreams, + const int nBatches, std::vector& fitters, std::vector& results, std::vector&... args) { auto* gpuInterface = GPUInterface::Instance(); - kernel::warmUpGpuKernel<<<1, 1>>>(); + kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>(); // Benchmarking events - // std::vector start(nStreams), stop(nStreams); - // cudaEvent_t totalStart, totalStop; - // gpuCheckError(cudaEventCreate(&totalStart)); - // gpuCheckError(cudaEventCreate(&totalStop)); - // for (int iBatch{0}; iBatch < nStreams; ++iBatch) { - // gpuCheckError(cudaEventCreate(&start[iBatch])); - // gpuCheckError(cudaEventCreate(&stop[iBatch])); - // } + std::vector ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches); + std::vector startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches); + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { + gpuCheckError(cudaEventCreate(&startIOUp[iBatch])); + gpuCheckError(cudaEventCreate(&endIOUp[iBatch])); + gpuCheckError(cudaEventCreate(&startIODown[iBatch])); + gpuCheckError(cudaEventCreate(&endIODown[iBatch])); + gpuCheckError(cudaEventCreate(&startKer[iBatch])); + gpuCheckError(cudaEventCreate(&endKer[iBatch])); + } // Tracks std::array tracks_device; @@ -171,16 +175,17 @@ void processBulk(const int nBlocks, int* results_device; gpuInterface->allocDevice(reinterpret_cast(&results_device), sizeof(int) * fitters.size()); - // gpuCheckError(cudaEventRecord(totalStart)); + // R.R. Computation int totalSize = fitters.size(); - int batchSize = totalSize / nStreams; - int remainder = totalSize % nStreams; + int batchSize = totalSize / nBatches; + int remainder = totalSize % nBatches; - for (int iBatch{0}; iBatch < nStreams; ++iBatch) { + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { auto& stream = gpuInterface->getNextStream(); auto offset = iBatch * batchSize + std::min(iBatch, remainder); auto nFits = batchSize + (iBatch < remainder ? 1 : 0); + gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream)); gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream)); iArg = 0; ([&] { @@ -188,23 +193,27 @@ void processBulk(const int nBlocks, ++iArg; }(), ...); - // gpuCheckError(cudaEventRecord(start[iBatch])); + gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream)); + + gpuCheckError(cudaEventRecord(startKer[iBatch], stream)); std::apply([&](auto&&... args) { kernel::processBatchKernel<<>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device); - // gpuCheckError(cudaEventRecord(stop[iBatch])); + gpuCheckError(cudaEventRecord(endKer[iBatch], stream)); gpuCheckError(cudaPeekAtLastError()); - gpuCheckError(cudaStreamSynchronize(stream)); iArg = 0; + gpuCheckError(cudaEventRecord(startIODown[iBatch], stream)); ([&] { gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream)); ++iArg; }(), ...); + gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream)); gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream)); + gpuCheckError(cudaEventRecord(endIODown[iBatch], stream)); } + ([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...); - // gpuCheckError(cudaEventRecord(totalStop)); for (auto* tracksD : tracks_device) { gpuInterface->freeDevice(tracksD); @@ -214,11 +223,30 @@ void processBulk(const int nBlocks, gpuInterface->freeDevice(results_device); gpuInterface->unregisterBuffer(fitters.data()); gpuInterface->unregisterBuffer(results.data()); - // float milliseconds = 0; - // gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop)); - // LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads); - // return results; + // Do benchmarks + gpuCheckError(cudaDeviceSynchronize()); + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { + gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch])); + gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch])); + gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch])); + } + + float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f); + float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f); + float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f); + LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads); + LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches); + LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches); + + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { + gpuCheckError(cudaEventDestroy(startIOUp[iBatch])); + gpuCheckError(cudaEventDestroy(endIOUp[iBatch])); + gpuCheckError(cudaEventDestroy(startIODown[iBatch])); + gpuCheckError(cudaEventDestroy(endIODown[iBatch])); + gpuCheckError(cudaEventDestroy(startKer[iBatch])); + gpuCheckError(cudaEventDestroy(endKer[iBatch])); + } } template void processBulk(const int, diff --git a/Common/DCAFitter/GPU/cuda/GPUInterface.cu b/Common/DCAFitter/GPU/cuda/GPUInterface.cu index 26529d41b294b..09f9cdc595dcd 100644 --- a/Common/DCAFitter/GPU/cuda/GPUInterface.cu +++ b/Common/DCAFitter/GPU/cuda/GPUInterface.cu @@ -9,6 +9,9 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. +/// \brief Helper interface to the GPU device, meant to be compatible with manual allocation/streams and GPUReconstruction ones. +/// \author matteo.concas@cern.ch + #ifdef __HIPCC__ #include "hip/hip_runtime.h" #else @@ -16,6 +19,7 @@ #endif #include +#include #include "DeviceInterface/GPUInterface.h" @@ -75,7 +79,8 @@ GPUInterface* GPUInterface::sGPUInterface = nullptr; GPUInterface* GPUInterface::Instance() { if (sGPUInterface == nullptr) { - sGPUInterface = new GPUInterface(8); // FIXME: get some configurable param to do so. + const auto* envValue = std::getenv("GPUINTERFACE_NSTREAMS"); + sGPUInterface = new GPUInterface(envValue == nullptr ? 8 : std::stoi(envValue)); } return sGPUInterface; } diff --git a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx index 56da8e173bcf0..a7254931737cb 100644 --- a/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx +++ b/Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx @@ -23,11 +23,6 @@ #include #include -#define nBlocks 30 -#define nThreads 256 -#define nStreams 8 -#define NTest 100001 - namespace o2 { namespace vertexing @@ -232,11 +227,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs) meanDAW /= nfoundAW ? nfoundA : 1; meanDW /= nfoundW ? nfoundW : 1; LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix"; - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime() * 1000 << " ms"; BOOST_CHECK(nfoundA > 0.99 * NTest); BOOST_CHECK(nfoundAW > 0.99 * NTest); @@ -310,11 +305,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs) meanDAW /= nfoundA ? nfoundA : 1; meanDW /= nfoundW ? nfoundW : 1; LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix from gamma conversion"; - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime(); - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime(); - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime(); BOOST_CHECK(nfoundA > 0.99 * NTest); BOOST_CHECK(nfoundAW > 0.99 * NTest); @@ -388,11 +383,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs) meanDAW /= nfoundAW ? nfoundAW : 1; meanDW /= nfoundW ? nfoundW : 1; LOG(info) << "Processed " << NTest << " 2-prong vertices: Helix : Line"; - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime(); - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime(); - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime(); BOOST_CHECK(nfoundA > 0.99 * NTest); BOOST_CHECK(nfoundAW > 0.99 * NTest); @@ -465,11 +460,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs) meanDAW /= nfoundAW ? nfoundAW : 1; meanDW /= nfoundW ? nfoundW : 1; LOG(info) << "Processed " << NTest << " 2-prong vertices: Line : Line"; - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime(); - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime(); - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime(); BOOST_CHECK(nfoundA > 0.99 * NTest); BOOST_CHECK(nfoundAW > 0.99 * NTest); @@ -541,11 +536,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs) meanDAW /= nfoundAW ? nfoundAW : 1; meanDW /= nfoundW ? nfoundW : 1; LOG(info) << "Processed " << NTest << " 3-prong vertices"; - LOG(info) << "3-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest + LOG(info) << "3-prongs with abs.dist minimization: eff = " << float(nfoundA) / NTest << " mean.dist to truth: " << meanDA << " Total time: " << swA.CpuTime(); - LOG(info) << "3-prongs with abs.dist but wghPCA: eff= " << float(nfoundAW) / NTest + LOG(info) << "3-prongs with abs.dist but wghPCA: eff = " << float(nfoundAW) / NTest << " mean.dist to truth: " << meanDAW << " Total time: " << swAW.CpuTime(); - LOG(info) << "3-prongs with wgh.dist minization: eff= " << float(nfoundW) / NTest + LOG(info) << "3-prongs with wgh.dist minimization: eff = " << float(nfoundW) / NTest << " mean.dist to truth: " << meanDW << " Total time: " << swW.CpuTime(); BOOST_CHECK(nfoundA > 0.99 * NTest); BOOST_CHECK(nfoundAW > 0.99 * NTest); @@ -561,8 +556,15 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs) BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) { - // gRandom->Delete(); - // gRandom = new TRandom(42); + const char* nThreadsEnvVarName = "DCAFITTERGPU_TEST_NTHREADS"; + const char* nBlocksEnvVarName = "DCAFITTERGPU_TEST_NBLOCKS"; + const char* nBatchesEnvVarName = "DCAFITTERGPU_TEST_NBATCHES"; + const char* nTestsEnvVarName = "DCAFITTERGPU_TEST_NTESTS"; + int nBlocks = std::getenv(nThreadsEnvVarName) == nullptr ? 30 : std::stoi(std::getenv(nThreadsEnvVarName)); + int nThreads = std::getenv(nBlocksEnvVarName) == nullptr ? 256 : std::stoi(std::getenv(nBlocksEnvVarName)); + int nBatches = std::getenv(nBatchesEnvVarName) == nullptr ? 8 : std::stoi(std::getenv(nBatchesEnvVarName)); + int NTest = std::getenv(nTestsEnvVarName) == nullptr ? 100001 : std::stoi(std::getenv(nTestsEnvVarName)); + o2::utils::TreeStreamRedirector outStreamB("dcafitterNTestBulk.root"); TGenPhaseSpace genPHS; @@ -614,7 +616,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) swAb.Start(false); std::vector ncAb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -631,7 +633,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swAWb.Start(false); std::vector ncAWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAWb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -648,7 +650,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swWb.Start(false); std::vector ncWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swWb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -664,11 +666,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) meanDAWb /= nfoundAWb ? nfoundAWb : 1; meanDWb /= nfoundWb ? nfoundWb : 1; LOGP(info, "Bulk-processed {} 2-prong vertices Helix : Helix", NTest); - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms"; BOOST_CHECK(nfoundAb > 0.99 * NTest); BOOST_CHECK(nfoundAWb > 0.99 * NTest); @@ -713,7 +715,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) swAb.Start(false); std::vector ncAb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -730,7 +732,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swAWb.Start(false); std::vector ncAWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAWb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -747,7 +749,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swWb.Start(false); std::vector ncWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swWb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -764,11 +766,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) meanDAWb /= nfoundAWb ? nfoundAWb : 1; meanDWb /= nfoundWb ? nfoundWb : 1; LOGP(info, "Bulk-processed {} 2-prong vertices Helix : Helix from gamma conversion", NTest); - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms"; BOOST_CHECK(nfoundAb > 0.99 * NTest); BOOST_CHECK(nfoundAWb > 0.99 * NTest); @@ -814,7 +816,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) swAb.Start(false); std::vector ncAb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -831,7 +833,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swAWb.Start(false); std::vector ncAWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAWb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -848,7 +850,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swWb.Start(false); std::vector ncWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swWb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -865,11 +867,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) meanDAWb /= nfoundAWb ? nfoundAWb : 1; meanDWb /= nfoundWb ? nfoundWb : 1; LOG(info) << "Bulk-processed " << NTest << " 2-prong vertices: Helix : Line"; - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms"; BOOST_CHECK(nfoundAb > 0.99 * NTest); BOOST_CHECK(nfoundAWb > 0.99 * NTest); @@ -914,7 +916,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) swAb.Start(false); std::vector ncAb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -931,7 +933,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swAWb.Start(false); std::vector ncAWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swAWb.Stop(); for (int iev = 0; iev < NTest; iev++) { LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAWb[iev] << " Chi2: " << (ncAWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1); @@ -948,7 +950,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) swWb.Start(false); std::vector ncWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES swWb.Stop(); for (int iev = 0; iev < NTest; iev++) { @@ -964,11 +966,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) meanDAWb /= nfoundAWb ? nfoundAWb : 1; meanDWb /= nfoundWb ? nfoundWb : 1; LOG(info) << "Bulk-processed " << NTest << " 2-prong vertices: Line : Line"; - LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest + LOG(info) << "2-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest + LOG(info) << "2-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms"; - LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest + LOG(info) << "2-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms"; BOOST_CHECK(nfoundAb > 0.99 * NTest); BOOST_CHECK(nfoundAWb > 0.99 * NTest); @@ -1014,7 +1016,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) std::fill(fitters_host.begin(), fitters_host.end(), ft); swAb.Start(false); std::vector ncAb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES swAb.Stop(); for (int iev = 0; iev < NTest; iev++) { LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAb[iev] << " Chi2: " << (ncAb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1); @@ -1031,7 +1033,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) swAWb.Start(false); std::vector ncAWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncAWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncAWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES swAWb.Stop(); for (int iev = 0; iev < NTest; iev++) { LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAWb[iev] << " Chi2: " << (ncAWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1); @@ -1048,7 +1050,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) swWb.Start(false); std::vector ncWb(NTest, 0); - device::processBulk(nBlocks, nThreads, nStreams, fitters_host, ncWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES + device::processBulk(nBlocks, nThreads, nBatches, fitters_host, ncWb, vctracks[0], vctracks[1], vctracks[2]); // HERE WE FIT THE VERTICES swWb.Stop(); for (int iev = 0; iev < NTest; iev++) { LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncWb[iev] << " Chi2: " << (ncWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1); @@ -1064,11 +1066,11 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk) meanDAWb /= nfoundAWb ? nfoundAWb : 1; meanDWb /= nfoundWb ? nfoundWb : 1; LOG(info) << "Bulk-processed " << NTest << " 3-prong vertices"; - LOG(info) << "3-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest + LOG(info) << "3-prongs with abs.dist minimization: eff = " << float(nfoundAb) / NTest << " mean.dist to truth: " << meanDAb << " Total time: " << swAb.CpuTime() * 1000 << " ms"; - LOG(info) << "3-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest + LOG(info) << "3-prongs with abs.dist but wghPCA: eff = " << float(nfoundAWb) / NTest << " mean.dist to truth: " << meanDAWb << " Total time: " << swAWb.CpuTime() * 1000 << " ms"; - LOG(info) << "3-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest + LOG(info) << "3-prongs with wgh.dist minimization: eff = " << float(nfoundWb) / NTest << " mean.dist to truth: " << meanDWb << " Total time: " << swWb.CpuTime() * 1000 << " ms"; BOOST_CHECK(nfoundAb > 0.99 * NTest); BOOST_CHECK(nfoundAWb > 0.99 * NTest); diff --git a/Common/DCAFitter/GPU/hip/CMakeLists.txt b/Common/DCAFitter/GPU/hip/CMakeLists.txt index a804775d051c7..f62759bb6ea2c 100644 --- a/Common/DCAFitter/GPU/hip/CMakeLists.txt +++ b/Common/DCAFitter/GPU/hip/CMakeLists.txt @@ -20,7 +20,7 @@ o2_add_hipified_library(DCAFitterHIP hip::host PRIVATE_LINK_LIBRARIES O2::GPUTrackingHIPExternalProvider TARGETVARNAME targetNAme) - + o2_add_test(DCAFitterNHIP SOURCES ../cuda/test/testDCAFitterNGPU.cxx PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats