Skip to content

Commit

Permalink
Simplify the interface
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Oct 1, 2024
1 parent a54238d commit 8a86d08
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 118 deletions.
44 changes: 12 additions & 32 deletions Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

#include <thread>
#include <vector>
#include <atomic>

namespace o2
{
Expand All @@ -28,12 +27,12 @@ namespace device

#if !defined(__HIPCC__) && !defined(__CUDACC__)
typedef struct _dummyStream {
} stream;
} Stream;
#else
#ifdef __HIPCC__
typedef hipStream_t stream;
typedef hipStream_t Stream;
#else
typedef cudaStream_t stream;
typedef cudaStream_t Stream;
#endif
#endif

Expand All @@ -46,43 +45,24 @@ class GPUInterface
static GPUInterface* Instance();

// APIs
void register(void*, size_t);
void allocAsync(void**, size_t, unsigned short streamId = -1);
void registerBuffer(void*, size_t);
void unregisterBuffer(void* addr);
void allocDevice(void**, size_t);
void freeDevice(void*);
Stream& getStream(short N = 0);

protected:
GPUInterface(size_t N)
{
resize(N);
}
GPUInterface(size_t N = 1);
~GPUInterface();

void resize(size_t);
unsigned short getNextCursor();

static GPUInterface* sGPUInterface = nullptr;
std::atomic<unsigned short> mCursor{0};
static GPUInterface* sGPUInterface;
std::vector<std::thread> mPool{};
std::vector<stream> mStreams{};
std::vector<Stream> mStreams{};
};

inline void GPUInterface::resize(size_t N)
{
mPool.resize(N);
mStreams.resize(N);
}

inline unsigned short GPUInterface::getNextCursor()
{
auto index = mCursor++;

auto id = index % mPool.size();

auto oldValue = mCursor;
auto newValue = oldValue % mPool.size();
while (!mCursor.compare_exchange_weak(oldValue, newValue, std::memory_order_relaxed)) {
newValue = oldValue % mPool.size();
}
return id;
}
} // namespace device
} // namespace vertexing
} // namespace o2
Expand Down
24 changes: 0 additions & 24 deletions Common/DCAFitter/GPU/DeviceInterface/GPUInterfaceConfigParam.h

This file was deleted.

5 changes: 1 addition & 4 deletions Common/DCAFitter/GPU/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,7 @@ o2_add_library(DCAFitterCUDA
O2::DetectorsBase
PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)

o2_target_root_dictionary(DCAFitterCUDA
HEADERS ../DeviceInterface/GPUInterfaceConfigParam.h
LINKDEF DCAFitterGPULinkDef.h)
add_compile_options(-lineinfo)

o2_add_test(DCAFitterNCUDA
SOURCES test/testDCAFitterNGPU.cxx
Expand Down
21 changes: 0 additions & 21 deletions Common/DCAFitter/GPU/cuda/DCAFitterGPULinkDef.h

This file was deleted.

23 changes: 14 additions & 9 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,16 @@ std::vector<int> processBulk(const int nBlocks,

int iArg{0};
([&] {
gpuInterface->register(args.data(), sizeof(Tr) * args.size());
gpuInterface->allocAsync(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size());
gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
gpuInterface->registerBuffer(reinterpret_cast<void*>(args.data()), sizeof(Tr) * args.size());
gpuInterface->allocDevice(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size());
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice, gpuInterface->getStream(iArg)));
++iArg;
}(),
...);
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
gpuInterface->registerBuffer(reinterpret_cast<void*>(fitters.data()), sizeof(Fitter) * nFits);
gpuInterface->registerBuffer(reinterpret_cast<void*>(results.data()), sizeof(int) * nFits);
gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits);
gpuInterface->allocDevice(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits);
gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));

gpuCheckError(cudaEventRecord(start));
Expand All @@ -172,14 +174,17 @@ std::vector<int> processBulk(const int nBlocks,

iArg = 0;
([&] {
gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
gpuCheckError(cudaFree(tracks_device[iArg]));
gpuCheckError(cudaMemcpyAsync(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost, gpuInterface->getStream(iArg)));
gpuInterface->freeDevice(tracks_device[iArg]);
gpuInterface->unregisterBuffer(args.data());
++iArg;
}(),
...);

gpuCheckError(cudaFree(fitters_device));
gpuCheckError(cudaFree(results_device));
gpuInterface->freeDevice(fitters_device);
gpuInterface->freeDevice(results_device);
gpuInterface->unregisterBuffer(fitters.data());
gpuInterface->unregisterBuffer(results.data());
gpuCheckError(cudaEventSynchronize(stop));

float milliseconds = 0;
Expand Down
47 changes: 42 additions & 5 deletions Common/DCAFitter/GPU/cuda/GPUInterface.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include <iostream>

#include "DeviceInterface/GPUInterface.h"
#include "DeviceInterface/GPUInterfaceConfigParam.h"

#define gpuCheckError(x) \
{ \
Expand All @@ -36,21 +35,59 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort =

namespace o2::vertexing::device
{

GPUInterface::GPUInterface(size_t N)
{
resize(N);
for (auto& st : mStreams) {
gpuCheckError(cudaStreamCreate(&st));
}
}

GPUInterface::~GPUInterface()
{
for (auto& st : mStreams) {
gpuCheckError(cudaStreamDestroy(st));
}
}

void GPUInterface::resize(size_t N)
{
mPool.resize(N);
mStreams.resize(N);
}

void GPUInterface::registerBuffer(void* addr, size_t bufferSize)
{
gpuCheckError(cudaHostRegister(addr, bufferSize, cudaHostRegisterDefault));
}

void GPUInterface::unregisterBuffer(void* addr)
{
gpuCheckError(cudaHostUnregister(addr));
}

GPUInterface* GPUInterface::sGPUInterface = nullptr;
GPUInterface* GPUInterface::Instance()
{
if (sGPUInterface == nullptr) {
sGPUInterface = new GPUInterface(o2::vertexing::device::GPUInterfaceParamConfig::Instance().streamPoolSize);
sGPUInterface = new GPUInterface(8); // FIXME: get some configurable param to do so.
}
return sGPUInterface;
}

void GPUInterface::register(void* addr, size_t bufferSize)
void GPUInterface::allocDevice(void** addrPtr, size_t bufferSize)
{
gpuCheckError(cudaHostRegister(addr, bufferSize, cudaHostRegisterDefault));
gpuCheckError(cudaMalloc(addrPtr, bufferSize));
}

void GPUInterface::allocAsync(void** addrPtr, size_t bufferSize, unsigned short streamId) {
void GPUInterface::freeDevice(void* addr)
{
gpuCheckError(cudaFree(addr));
}

Stream& GPUInterface::getStream(short N)
{
return mStreams[N % mStreams.size()];
}
} // namespace o2::vertexing::device
19 changes: 0 additions & 19 deletions Common/DCAFitter/GPU/cuda/GPUInterfaceConfigParam.cxx

This file was deleted.

6 changes: 3 additions & 3 deletions Common/DCAFitter/GPU/cuda/test/testDCAFitterNGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
#include <Math/SVector.h>
#include <array>

#define nBlocks 60
#define nThreads 1024
#define NTest 1000000
#define nBlocks 30
#define nThreads 256
#define NTest 100000

namespace o2
{
Expand Down
2 changes: 1 addition & 1 deletion Common/DCAFitter/GPU/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ o2_add_hipified_library(DCAFitterHIP
hip::host
PRIVATE_LINK_LIBRARIES O2::GPUTrackingHIPExternalProvider
TARGETVARNAME targetNAme)

o2_add_test(DCAFitterNHIP
SOURCES ../cuda/test/testDCAFitterNGPU.cxx
PUBLIC_LINK_LIBRARIES O2::ReconstructionDataFormats
Expand Down

0 comments on commit 8a86d08

Please sign in to comment.