Improve scheduling, benchmarking, and config
mconcas committed Oct 1, 2024
1 parent 1a33857 commit 9d34af4
Showing 6 changed files with 113 additions and 77 deletions.
1 change: 1 addition & 0 deletions Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h
@@ -17,6 +17,7 @@

#include <thread>
#include <vector>
+#include <atomic>

namespace o2
{
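A note on the new include: DCAFitterN.cu below now picks a stream per batch via gpuInterface->getNextStream(), and the added <atomic> header suggests a thread-safe round-robin cursor inside GPUInterface. The class layout is not shown in this hunk, so the following dispenser is a hypothetical sketch only (StreamPool is an invented name; the real GPUInterface also handles device allocation and buffer pinning, which this omits):

// Hypothetical sketch of a thread-safe round-robin stream dispenser; the
// atomic cursor is why a plain int counter would not be enough under
// concurrent callers.
#include <atomic>
#include <cstddef>
#include <vector>
#include <cuda_runtime.h>

class StreamPool
{
 public:
  explicit StreamPool(int n) : mStreams(n)
  {
    for (auto& s : mStreams) {
      cudaStreamCreate(&s); // one reusable stream per slot
    }
  }
  ~StreamPool()
  {
    for (auto& s : mStreams) {
      cudaStreamDestroy(s);
    }
  }
  // fetch_add hands each caller a distinct ticket, so concurrent callers
  // cycle through the pool without racing on the cursor.
  cudaStream_t& getNextStream()
  {
    return mStreams[mCursor.fetch_add(1) % mStreams.size()];
  }

 private:
  std::vector<cudaStream_t> mStreams;
  std::atomic<std::size_t> mCursor{0};
};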
2 changes: 1 addition & 1 deletion Common/DCAFitter/GPU/cuda/CMakeLists.txt
@@ -20,7 +20,7 @@ o2_add_library(DCAFitterCUDA
O2::DetectorsBase
PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
-add_compile_options(-lineinfo)
+# add_compile_options(-lineinfo)

o2_add_test(DCAFitterNCUDA
SOURCES test/testDCAFitterNGPU.cxx
72 changes: 50 additions & 22 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -15,6 +15,8 @@
#include <cuda.h>
#endif

+#include <numeric>

#include "GPUCommonDef.h"
#include "DCAFitter/DCAFitterN.h"
#include "DeviceInterface/GPUInterface.h"
@@ -133,23 +135,25 @@ int process(const int nBlocks,
template <typename Fitter, class... Tr>
void processBulk(const int nBlocks,
const int nThreads,
-const int nStreams,
+const int nBatches,
std::vector<Fitter>& fitters,
std::vector<int>& results,
std::vector<Tr>&... args)
{
auto* gpuInterface = GPUInterface::Instance();
-kernel::warmUpGpuKernel<<<1, 1>>>();
+kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>();
// Benchmarking events
-// std::vector<cudaEvent_t> start(nStreams), stop(nStreams);
-// cudaEvent_t totalStart, totalStop;
-// gpuCheckError(cudaEventCreate(&totalStart));
-// gpuCheckError(cudaEventCreate(&totalStop));
-// for (int iBatch{0}; iBatch < nStreams; ++iBatch) {
-// gpuCheckError(cudaEventCreate(&start[iBatch]));
-// gpuCheckError(cudaEventCreate(&stop[iBatch]));
-// }
+std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches);
+std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches);
+for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+  gpuCheckError(cudaEventCreate(&startIOUp[iBatch]));
+  gpuCheckError(cudaEventCreate(&endIOUp[iBatch]));
+  gpuCheckError(cudaEventCreate(&startIODown[iBatch]));
+  gpuCheckError(cudaEventCreate(&endIODown[iBatch]));
+  gpuCheckError(cudaEventCreate(&startKer[iBatch]));
+  gpuCheckError(cudaEventCreate(&endKer[iBatch]));
+}
// Tracks
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
@@ -171,40 +175,45 @@
int* results_device;
gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size());
-// gpuCheckError(cudaEventRecord(totalStart));
// R.R. Computation
int totalSize = fitters.size();
-int batchSize = totalSize / nStreams;
-int remainder = totalSize % nStreams;
+int batchSize = totalSize / nBatches;
+int remainder = totalSize % nBatches;
-for (int iBatch{0}; iBatch < nStreams; ++iBatch) {
+for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
auto& stream = gpuInterface->getNextStream();
auto offset = iBatch * batchSize + std::min(iBatch, remainder);
auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
+gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream));
iArg = 0;
([&] {
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
++iArg;
}(),
...);
-// gpuCheckError(cudaEventRecord(start[iBatch]));
+gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
+gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
-// gpuCheckError(cudaEventRecord(stop[iBatch]));
+gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaStreamSynchronize(stream));
iArg = 0;
+gpuCheckError(cudaEventRecord(startIODown[iBatch], stream));
([&] {
gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream));
++iArg;
}(),
...);
gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream));
gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
+gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
}
([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
-// gpuCheckError(cudaEventRecord(totalStop));
for (auto* tracksD : tracks_device) {
gpuInterface->freeDevice(tracksD);
@@ -214,11 +223,30 @@
gpuInterface->freeDevice(results_device);
gpuInterface->unregisterBuffer(fitters.data());
gpuInterface->unregisterBuffer(results.data());
-// float milliseconds = 0;
-// gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
-// LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
-// return results;
+// Do benchmarks
+gpuCheckError(cudaDeviceSynchronize());
+for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+  gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
+  gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
+  gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
+}
+float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
+float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
+float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
+LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads);
+LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches);
+LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches);
+for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+  gpuCheckError(cudaEventDestroy(startIOUp[iBatch]));
+  gpuCheckError(cudaEventDestroy(endIOUp[iBatch]));
+  gpuCheckError(cudaEventDestroy(startIODown[iBatch]));
+  gpuCheckError(cudaEventDestroy(endIODown[iBatch]));
+  gpuCheckError(cudaEventDestroy(startKer[iBatch]));
+  gpuCheckError(cudaEventDestroy(endKer[iBatch]));
+}
}
template void processBulk(const int,
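The offset arithmetic in the batch loop above splits fitters.size() fits into nBatches contiguous chunks whose sizes differ by at most one: the first `remainder` batches take one extra fit each. A standalone check of that partitioning (the concrete totalSize and nBatches values are invented for illustration):

// Standalone illustration of the batch partitioning used by processBulk.
#include <algorithm>
#include <cassert>
#include <cstdio>

int main()
{
  const int totalSize = 10; // e.g. fitters.size()
  const int nBatches = 4;
  const int batchSize = totalSize / nBatches;
  const int remainder = totalSize % nBatches;
  int covered = 0;
  for (int iBatch = 0; iBatch < nBatches; ++iBatch) {
    // Each of the first `remainder` batches is one element longer, and the
    // offset shifts right by one for every extra element already handed out.
    const int offset = iBatch * batchSize + std::min(iBatch, remainder);
    const int nFits = batchSize + (iBatch < remainder ? 1 : 0);
    std::printf("batch %d: offset=%d nFits=%d\n", iBatch, offset, nFits);
    covered += nFits;
  }
  assert(covered == totalSize); // the batches tile the input with no gaps
  return 0;
}

With totalSize = 10 and nBatches = 4 this prints offsets 0, 3, 6, 8 with sizes 3, 3, 2, 2.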
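The benchmarking change itself replaces the commented-out single start/stop pair with three event pairs per batch (upload, kernel, download), each recorded on that batch's stream and read back only after a device-wide synchronize. A reduced, single-stream sketch of the same pattern, with a dummy kernel and invented buffer names standing in for the commit's processBatchKernel and track buffers:

// Reduced single-stream sketch of the per-batch event timing pattern above.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummyKernel(float* data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    data[i] *= 2.f;
  }
}

int main()
{
  const int n = 1 << 20;
  float* hostBuf;
  float* devBuf;
  cudaMallocHost(reinterpret_cast<void**>(&hostBuf), n * sizeof(float)); // pinned, so the async copy can overlap
  cudaMalloc(reinterpret_cast<void**>(&devBuf), n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaEvent_t startIOUp, endIOUp, startKer, endKer;
  cudaEventCreate(&startIOUp);
  cudaEventCreate(&endIOUp);
  cudaEventCreate(&startKer);
  cudaEventCreate(&endKer);

  // Events are recorded on the same stream as the work they bracket,
  // mirroring what processBulk does once per batch.
  cudaEventRecord(startIOUp, stream);
  cudaMemcpyAsync(devBuf, hostBuf, n * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaEventRecord(endIOUp, stream);
  cudaEventRecord(startKer, stream);
  dummyKernel<<<(n + 255) / 256, 256, 0, stream>>>(devBuf, n);
  cudaEventRecord(endKer, stream);

  // Elapsed times are only valid once the recorded events have completed.
  cudaDeviceSynchronize();
  float ioUpMs = 0.f, kerMs = 0.f;
  cudaEventElapsedTime(&ioUpMs, startIOUp, endIOUp);
  cudaEventElapsedTime(&kerMs, startKer, endKer);
  std::printf("H2D: %.3f ms, kernel: %.3f ms\n", ioUpMs, kerMs);

  cudaEventDestroy(startIOUp);
  cudaEventDestroy(endIOUp);
  cudaEventDestroy(startKer);
  cudaEventDestroy(endKer);
  cudaStreamDestroy(stream);
  cudaFree(devBuf);
  cudaFreeHost(hostBuf);
  return 0;
}

The commit then reduces the per-batch numbers with std::accumulate (hence the new <numeric> include) and logs totals and per-batch averages via LOGP.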
7 changes: 6 additions & 1 deletion Common/DCAFitter/GPU/cuda/GPUInterface.cu
@@ -9,13 +9,17 @@
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

+/// \brief Helper interface to the GPU device, meant to be compatible with manual allocation/streams and GPUReconstruction ones.
+/// \author [email protected]

#ifdef __HIPCC__
#include "hip/hip_runtime.h"
#else
#include <cuda.h>
#endif

#include <iostream>
+#include <cstdlib>

#include "DeviceInterface/GPUInterface.h"

@@ -75,7 +79,8 @@ GPUInterface* GPUInterface::sGPUInterface = nullptr;
GPUInterface* GPUInterface::Instance()
{
if (sGPUInterface == nullptr) {
-sGPUInterface = new GPUInterface(8); // FIXME: get some configurable param to do so.
+const auto* envValue = std::getenv("GPUINTERFACE_NSTREAMS");
+sGPUInterface = new GPUInterface(envValue == nullptr ? 8 : std::stoi(envValue));
}
return sGPUInterface;
}
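Instance() now reads GPUINTERFACE_NSTREAMS instead of hard-coding 8 streams, resolving the old FIXME. Note that std::stoi throws on non-numeric input; a slightly more defensive variant could look like the sketch below (the fallback and the helper name are editorial additions, not part of the commit):

// Hypothetical helper mirroring GPUInterface::Instance()'s env lookup;
// kDefaultStreams stands in for the hard-coded default of 8.
#include <cstdlib>
#include <exception>
#include <string>

constexpr int kDefaultStreams = 8;

int streamCountFromEnv()
{
  const char* envValue = std::getenv("GPUINTERFACE_NSTREAMS");
  if (envValue == nullptr) {
    return kDefaultStreams; // unset: keep the previous default
  }
  try {
    return std::stoi(envValue);
  } catch (const std::exception&) {
    // std::stoi raises std::invalid_argument/std::out_of_range on bad
    // input; fall back instead of propagating out of a singleton accessor.
    return kDefaultStreams;
  }
}

Usage is simply setting the variable in the environment, e.g. GPUINTERFACE_NSTREAMS=16, before launching the benchmark.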