Skip to content

Commit

Permalink
Add processBulk to perform fits in parallel
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Sep 18, 2024
1 parent c981ee0 commit 350cf3c
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 26 deletions.
96 changes: 81 additions & 15 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,42 @@ namespace o2::vertexing::device
namespace kernel
{
template <typename Fitter>
GPUg() void printKernel(Fitter* ft)
GPUg() void printKernel(Fitter* fitter)
{
if (threadIdx.x == 0) {
printf(" =============== GPU DCA Fitter %d prongs ================\n", Fitter::getNProngs());
ft->print();
printf(" =============== GPU DCA Fitter %d prongs =================\n", Fitter::getNProngs());
fitter->print();
printf(" =========================================================\n");
}
}

template <typename Fitter, typename... Tr>
GPUg() void processKernel(Fitter* ft, int* res, Tr*... tracks)
GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
{
*res = ft->process(*tracks...);
*res = fitter->process(*tracks...);
}

template <typename Fitter, typename... Tr>
GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks)
{
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
results[iThread] = fitters[iThread].process(tracks[iThread]...);
}
}

} // namespace kernel

/// CPU handlers
template <typename Fitter>
void print(const int nBlocks,
const int nThreads,
Fitter& ft)
Fitter& fitter)
{
Fitter* ft_device;
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(Fitter), cudaMemcpyHostToDevice));
Fitter* fitter_device;
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));

kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
kernel::printKernel<<<nBlocks, nThreads>>>(fitter_device);

gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());
Expand All @@ -75,11 +84,11 @@ int process(const int nBlocks,
Fitter& fitter,
Tr&... args)
{
Fitter* ft_device;
Fitter* fitter_device;
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
int result, *result_device;

gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));

int iArg{0};
Expand All @@ -90,15 +99,15 @@ int process(const int nBlocks,
}(),
...);

gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));

std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, result_device, args...); }, tracks_device);
std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(fitter_device, result_device, args...); }, tracks_device);

gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());

gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
gpuCheckError(cudaMemcpy(&fitter, fitter_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
iArg = 0;
([&] {
gpuCheckError(cudaMemcpy(&args, tracks_device[iArg], sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
Expand All @@ -112,6 +121,63 @@ int process(const int nBlocks,
return result;
}

template <typename Fitter, class... Tr>
std::vector<int> processBulk(const int nBlocks,
const int nThreads,
std::vector<Fitter>& fitters,
std::vector<Tr>&... args)
{
cudaEvent_t start, stop;
gpuCheckError(cudaEventCreate(&start));
gpuCheckError(cudaEventCreate(&stop));
const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
std::vector<int> results(nFits);
int* results_device;
Fitter* fitters_device;
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;

int iArg{0};
([&] {
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()));
gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
++iArg;
}(),
...);
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));

// Call kernel
gpuCheckError(cudaEventRecord(start));
std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device);
gpuCheckError(cudaEventRecord(stop));

gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());

gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost));
gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost));

iArg = 0;
([&] {
gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
gpuCheckError(cudaFree(tracks_device[iArg]));
++iArg;
}(),
...);

gpuCheckError(cudaFree(fitters_device));
gpuCheckError(cudaFree(results_device));
gpuCheckError(cudaEventSynchronize(stop));

float milliseconds = 0;
gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));

LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
return results;
}

template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);
Expand Down
121 changes: 120 additions & 1 deletion Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
}
device::print(1, 1, ft);
meanDA /= nfoundA ? nfoundA : 1;
meanDAW /= nfoundA ? nfoundA : 1;
meanDAW /= nfoundAW ? nfoundA : 1;
meanDW /= nfoundW ? nfoundW : 1;
LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix";
LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
Expand Down Expand Up @@ -554,5 +554,124 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
outStream.Close();
}

BOOST_AUTO_TEST_CASE(DCAFitterNProngsBulk)
{
gRandom->Delete();
gRandom = new TRandom(42);
constexpr int NTest = 10000;
o2::utils::TreeStreamRedirector outStreamB("dcafitterNTestBulk.root");

TGenPhaseSpace genPHS;
constexpr double ele = 0.00051;
constexpr double gamma = 2 * ele + 1e-6;
constexpr double pion = 0.13957;
constexpr double k0 = 0.49761;
constexpr double kch = 0.49368;
constexpr double dch = 1.86965;
std::vector<double> gammadec = {ele, ele};
std::vector<double> k0dec = {pion, pion};
std::vector<double> dchdec = {pion, kch, pion};
std::vector<std::vector<o2::track::TrackParCov>> vctracks(2, std::vector<o2::track::TrackParCov>(NTest));
std::vector<Vec3D> vtxGen(NTest);

double bz = 5.0;
{ // 2 prongs vertices bulk processing
LOG(info) << "Bulk-processing 2-prong Helix - Helix case";
std::vector<int> forceQ{1, 1};

o2::vertexing::DCAFitterN<2> ft; // 2 prong fitter
ft.setBz(bz);
ft.setPropagateToPCA(true); // After finding the vertex, propagate tracks to the DCA. This is default anyway
ft.setMaxR(200); // do not consider V0 seeds with 2D circles crossing above this R. This is default anyway
ft.setMaxDZIni(4); // do not consider V0 seeds with tracks Z-distance exceeding this. This is default anyway
ft.setMaxDXYIni(4); // do not consider V0 seeds with tracks XY-distance exceeding this. This is default anyway
ft.setMinParamChange(1e-3); // stop iterations if max correction is below this value. This is default anyway
ft.setMinRelChi2Change(0.9); // stop iterations if chi2 improves by less that this factor

std::vector<o2::vertexing::DCAFitterN<2>> fitters_host;
std::vector<TLorentzVector> genParents(NTest);
fitters_host.resize(NTest);

std::string treeName2Abulk = "pr2aBulk", treeName2AWbulk = "pr2awBulk", treeName2Wbulk = "pr2wBulk";
TStopwatch swAb, swAWb, swWb;
int nfoundAb = 0, nfoundAWb = 0, nfoundWb = 0;
double meanDAb = 0, meanDAWb = 0, meanDWb = 0;
swAb.Stop();
swAWb.Stop();
swWb.Stop();

ft.setUseAbsDCA(true);
std::fill(fitters_host.begin(), fitters_host.end(), ft);
for (int iev = 0; iev < NTest; iev++) {
std::vector<o2::track::TrackParCov> vc(2);
genParents[iev] = generate(vtxGen[iev], vc, bz, genPHS, k0, k0dec, forceQ);
vctracks[0][iev] = vc[0];
vctracks[1][iev] = vc[1];
}

swAb.Start(false);
auto ncAb = device::processBulk(20, 512, fitters_host, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
swAb.Stop();

for (int iev = 0; iev < NTest; iev++) {
LOG(debug) << "fit abs.dist " << iev << " NC: " << ncAb[iev] << " Chi2: " << (ncAb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
if (ncAb[iev]) {
auto minDb = checkResults(outStreamB, treeName2Abulk, fitters_host[iev], vtxGen[iev], genParents[iev], k0dec);
meanDAb += minDb;
nfoundAb++;
}
}

ft.setUseAbsDCA(true);
ft.setWeightedFinalPCA(true);
std::fill(fitters_host.begin(), fitters_host.end(), ft);
swAWb.Start(false);
auto ncAWb = device::processBulk(20, 512, fitters_host, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
swAWb.Stop();

for (int iev = 0; iev < NTest; iev++) {
LOG(debug) << "fit abs.dist with final weighted DCA " << iev << " NC: " << ncAWb[iev] << " Chi2: " << (ncAWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
if (ncAWb[iev]) {
auto minDb = checkResults(outStreamB, treeName2AWbulk, fitters_host[iev], vtxGen[iev], genParents[iev], k0dec);
meanDAWb += minDb;
nfoundAWb++;
}
}

ft.setUseAbsDCA(false);
ft.setWeightedFinalPCA(false);
std::fill(fitters_host.begin(), fitters_host.end(), ft);
swWb.Start(false);
auto ncWb = device::processBulk(20, 512, fitters_host, vctracks[0], vctracks[1]); // HERE WE FIT THE VERTICES
swWb.Stop();

for (int iev = 0; iev < NTest; iev++) {
LOG(debug) << "fit wgh.dist " << iev << " NC: " << ncWb[iev] << " Chi2: " << (ncWb[iev] ? fitters_host[iev].getChi2AtPCACandidate(0) : -1);
if (ncWb[iev]) {
auto minDb = checkResults(outStreamB, treeName2Wbulk, fitters_host[iev], vtxGen[iev], genParents[iev], k0dec);
meanDWb += minDb;
nfoundWb++;
}
}

meanDAb /= nfoundAb ? nfoundAb : 1;
meanDAWb /= nfoundAWb ? nfoundAb : 1;
meanDWb /= nfoundWb ? nfoundWb : 1;
LOGP(info, "Processed {} 2-prong vertices Helix : Helix", NTest);
LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundAb) / NTest
<< " mean.dist to truth: " << meanDAb << " GPU time: " << swAb.CpuTime();
LOG(info) << "2-prongs with abs.dist but wghPCA: eff= " << float(nfoundAWb) / NTest
<< " mean.dist to truth: " << meanDAWb << " GPU time: " << swAWb.CpuTime();
LOG(info) << "2-prongs with wgh.dist minization: eff= " << float(nfoundWb) / NTest
<< " mean.dist to truth: " << meanDWb << " GPU time: " << swWb.CpuTime();
BOOST_CHECK(nfoundAb > 0.99 * NTest);
BOOST_CHECK(nfoundAWb > 0.99 * NTest);
BOOST_CHECK(nfoundWb > 0.99 * NTest);
BOOST_CHECK(meanDAb < 0.1);
BOOST_CHECK(meanDAWb < 0.1);
BOOST_CHECK(meanDWb < 0.1);
}
}

} // namespace vertexing
} // namespace o2
16 changes: 7 additions & 9 deletions Common/DCAFitter/include/DCAFitter/DCAFitterN.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
#ifndef _ALICEO2_DCA_FITTERN_
#define _ALICEO2_DCA_FITTERN_

#include "MathUtils/Cartesian.h"
#include "ReconstructionDataFormats/Track.h"
#include "DCAFitter/HelixHelper.h"
#include "DetectorsBase/Propagator.h"
#include "MathUtils/Cartesian.h"
#include "ReconstructionDataFormats/Track.h"

namespace o2
{
Expand Down Expand Up @@ -1136,15 +1136,13 @@ using DCAFitter3 = DCAFitterN<3, o2::track::TrackParCov>;
namespace device
{
template <typename Fitter>
void print(const int nBlocks,
const int nThreads,
Fitter& ft);
void print(const int nBlocks, const int nThreads, Fitter& ft);

template <typename Fitter, class... Tr>
int process(const int nBlocks,
const int nThreads,
Fitter&,
Tr&... args);
int process(const int nBlocks, const int nThreads, Fitter&, Tr&... args);

template <class Fitter, class... Tr>
std::vector<int> processBulk(const int nBlocks, const int nThreads, std::vector<Fitter>& fitters, std::vector<Tr>&... args);
} // namespace device

} // namespace vertexing
Expand Down
2 changes: 1 addition & 1 deletion Common/DCAFitter/test/testDCAFitterN.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ BOOST_AUTO_TEST_CASE(DCAFitterNProngs)
}
ft.print();
meanDA /= nfoundA ? nfoundA : 1;
meanDAW /= nfoundA ? nfoundA : 1;
meanDAW /= nfoundAW ? nfoundA : 1;
meanDW /= nfoundW ? nfoundW : 1;
LOG(info) << "Processed " << NTest << " 2-prong vertices Helix : Helix";
LOG(info) << "2-prongs with abs.dist minization: eff= " << float(nfoundA) / NTest
Expand Down

0 comments on commit 350cf3c

Please sign in to comment.