Skip to content

Commit

Permalink
add kernel metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Oct 2, 2024
1 parent d537a7e commit 83714da
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 4 deletions.
13 changes: 11 additions & 2 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,22 @@ GPUg() void printKernel(Fitter* fitter)
}
}

template <typename Fitter>
GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
{
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
fitters[iThread + off] = fitters[off];
}
}

template <typename Fitter, typename... Tr>
GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
{
*res = fitter->process(*tracks...);
}

template <typename Fitter, typename... Tr>
GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks)
GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
{
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
Expand Down Expand Up @@ -186,7 +194,7 @@ void processBulk(const int nBlocks,
auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
iArg = 0;
([&] {
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
Expand All @@ -196,6 +204,7 @@ void processBulk(const int nBlocks,
gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
Expand Down
11 changes: 10 additions & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,10 @@ void trackSeedHandler(CellSeed* trackSeeds,
const int nBlocks,
const int nThreads)
{
cudaEvent_t start, stop;
gpuCheckError(cudaEventCreate(&start));
gpuCheckError(cudaEventCreate(&stop));
gpuCheckError(cudaEventRecord(start));
gpu::fitTrackSeedsKernel<<<nBlocks, nThreads>>>(
trackSeeds, // CellSeed* trackSeeds,
foundTrackingFrameInfo, // TrackingFrameInfo** foundTrackingFrameInfo,
Expand All @@ -732,8 +736,13 @@ void trackSeedHandler(CellSeed* trackSeeds,
maxChi2NDF, // float maxChi2NDF,
propagator, // const o2::base::Propagator* propagator
matCorrType); // o2::base::PropagatorF::MatCorrType matCorrType

gpuCheckError(cudaEventRecord(stop));
gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());
gpuCheckError(cudaEventSynchronize(stop));
float milliseconds = 0;
gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));

LOGP(info, "Parallel fit took: {} ms", milliseconds);
}
} // namespace o2::its
7 changes: 6 additions & 1 deletion Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#include "ITStracking/Tracklet.h"
#include "ReconstructionDataFormats/Track.h"

#include <TStopwatch.h>

#ifdef WITH_OPENMP
#include <omp.h>
#endif
Expand Down Expand Up @@ -600,6 +602,8 @@ void TrackerTraits::findRoads(const int iteration)

std::vector<TrackITSExt> tracks(trackSeeds.size());
std::atomic<size_t> trackIndex{0};
TStopwatch timer;
timer.Start();
#pragma omp parallel for num_threads(mNThreads)
for (size_t seedId = 0; seedId < trackSeeds.size(); ++seedId) {
const CellSeed& seed{trackSeeds[seedId]};
Expand All @@ -623,7 +627,8 @@ void TrackerTraits::findRoads(const int iteration)
}
tracks[trackIndex++] = temporaryTrack;
}

timer.Stop();
LOGP(info, "Parallel fit took: {:2.3} ms", timer.RealTime() * 1000);
tracks.resize(trackIndex);
std::sort(tracks.begin(), tracks.end(), [](const TrackITSExt& a, const TrackITSExt& b) {
return a.getChi2() < b.getChi2();
Expand Down

0 comments on commit 83714da

Please sign in to comment.