Skip to content

Commit

Permalink
Checkpointing
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Nov 20, 2024
1 parent 921415d commit 51e7404
Show file tree
Hide file tree
Showing 7 changed files with 269 additions and 152 deletions.
13 changes: 11 additions & 2 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,14 @@ class TimeFrameGPU : public TimeFrame
void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int);
void initDeviceSAFitting();
void loadIndexTableUtils(const int);
void loadTrackingFrameInfoDevice(const int);
void loadUnsortedClustersDevice(const int);
void loadClustersDevice(const int);
void loadROframeClustersDevice(const int iteration);
void loadClustersIndexTables(const int iteration);
void createUsedClustersDevice(const int);
void loadUsedClustersDevice();
void loadROframeClustersDevice(const int);
void loadMultiplicityCutMask(const int);
void loadVertices(const int);

Expand Down Expand Up @@ -112,6 +116,8 @@ class TimeFrameGPU : public TimeFrame
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; }
const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; }
const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; }
const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
Expand Down Expand Up @@ -148,17 +154,20 @@ class TimeFrameGPU : public TimeFrame
// Device pointers
StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
IndexTableUtils* mIndexTableUtilsDevice;
std::array<unsigned char*, nLayers> mUsedClustersDevice;

// Hybrid pref
uint8_t* mMultMaskDevice;
Vertex* mPrimaryVerticesDevice;
int* mROFramesPVDevice;
std::array<Cluster*, nLayers> mClustersDevice;
std::array<Cluster*, nLayers> mUnsortedClustersDevice;
std::array<int*, nLayers> mClustersIndexTablesDevice;
std::array<unsigned char*, nLayers> mUsedClustersDevice;
std::array<int*, nLayers> mROFramesClustersDevice;
const Cluster** mClustersDeviceArray;
const Cluster** mUnsortedClustersDeviceArray;
const int** mClustersIndexTablesDeviceArray;
const unsigned char** mUsedClustersDeviceArray;
const int** mROFrameClustersDeviceArray;
std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
const Tracklet** mTrackletsDeviceArray;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ GPUg() void fitTrackSeedsKernel(
} // namespace gpu

template <int nLayers = 7>
void computeTrackletsInRofsHandler(const uint8_t* multMask,
void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
const uint8_t* multMask,
const int startROF,
const int endROF,
const int maxROF,
Expand All @@ -62,6 +63,15 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask,
const int nVertices,
const Cluster** clusters,
const int** ROFClusters,
const unsigned char** usedClusters,
const int** clustersIndexTables,
const int iteration,
const float NSigmaCut,
std::vector<float>& phiCuts,
const float resolutionPV,
std::vector<float>& minR,
std::vector<float>& maxR,
std::vector<float>& resolutions,
std::vector<float>& radii,
std::vector<float>& mulScatAng,
const int nBlocks,
Expand Down
16 changes: 8 additions & 8 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ struct gpuSpan {
using ref = T&;

GPUd() gpuSpan() : _data(nullptr), _size(0) {}
GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
GPUd() std::size_t size() const { return _size; }
GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
GPUd() unsigned int size() const { return _size; }
GPUd() bool empty() const { return _size == 0; }
GPUd() ref front() const { return _data[0]; }
GPUd() ref back() const { return _data[_size - 1]; }
Expand All @@ -50,7 +50,7 @@ struct gpuSpan {

protected:
ptr _data;
std::size_t _size;
unsigned int _size;
};

template <typename T>
Expand All @@ -60,10 +60,10 @@ struct gpuSpan<const T> {
using ref = const T&;

GPUd() gpuSpan() : _data(nullptr), _size(0) {}
GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
GPUd() gpuSpan(const gpuSpan<T>& other) : _data(other._data), _size(other._size) {}
GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
GPUd() std::size_t size() const { return _size; }
GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
GPUd() unsigned int size() const { return _size; }
GPUd() bool empty() const { return _size == 0; }
GPUd() ref front() const { return _data[0]; }
GPUd() ref back() const { return _data[_size - 1]; }
Expand All @@ -72,7 +72,7 @@ struct gpuSpan<const T> {

protected:
ptr _data;
std::size_t _size;
unsigned int _size;
};

enum class Task {
Expand Down
64 changes: 60 additions & 4 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl<f
mPropagatorDevice = propagator;
}

template <int nLayers>
void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
{
  // Copy the host-side IndexTableUtils object onto the device. The device
  // buffer is allocated once (first iteration) and reused on later ones,
  // while the copy itself is re-issued every call.
  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils");
  if (iteration == 0) {
    LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
    allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator());
  }
  LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
  checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
{
Expand Down Expand Up @@ -128,13 +141,56 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
}
}

template <int nLayers>
void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
{
  // Uploads the per-layer cluster index tables to the device and publishes a
  // device-side array of pointers to them. Work is done only on the first
  // iteration; later iterations reuse the device copies.
  if (!iteration) {
    // Fixed label: previous one ("loading sorted clusters") was copy-pasted
    // from the cluster loader and mislabels this timer in profiles.
    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading clusters index tables");
    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
      // debug level for consistency with the other per-iteration loaders.
      LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB);
      allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
      checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
    }
    // This buffer holds nLayers *pointers*, so it must be sized with
    // sizeof(int*), not sizeof(int): the original undersized the allocation
    // and truncated the copy to half the pointers on 64-bit targets.
    allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator());
    checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
  }
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
{
  // First-iteration setup: allocate the per-layer used-cluster flag buffers
  // on the device, zero-initialise them, and upload the array of pointers
  // through which kernels address them.
  if (iteration != 0) {
    return;
  }
  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
  for (int layer = 0; layer < nLayers; ++layer) {
    const auto nFlagBytes = mUsedClusters[layer].size() * sizeof(unsigned char);
    LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[layer].size(), layer, nFlagBytes / MB);
    allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[layer]), nFlagBytes, nullptr, getExtAllocator());
    // Flags start cleared: no cluster has been used yet.
    checkGPUError(cudaMemsetAsync(mUsedClustersDevice[layer], 0, nFlagBytes, mGpuStreams[0].get()));
  }
  allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator());
  checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
{
  // Uploads the host-side used-cluster flags into the device buffers
  // previously allocated by createUsedClustersDevice(), so the next tracking
  // iteration can skip clusters already attached to tracks.
  // Fixed label: previous one ("creating used clusters flags") was
  // copy-pasted from createUsedClustersDevice().
  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags");
  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
    // Size the MB figure from mUsedClusters (the buffer actually
    // transferred); the original computed it from mClusters by mistake.
    LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
    checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
  }
  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
{
if (!iteration) {
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
}
Expand Down Expand Up @@ -167,7 +223,7 @@ void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
{
if (!iteration) {
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator());
checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
Expand All @@ -179,10 +235,10 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
{
if (!iteration) {
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator());
checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
Expand Down
16 changes: 15 additions & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,13 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers);
mTimeFrameGPU->loadClustersDevice(iteration);
mTimeFrameGPU->loadUnsortedClustersDevice(iteration);
mTimeFrameGPU->loadClustersIndexTables(iteration);
mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration);
mTimeFrameGPU->loadMultiplicityCutMask(iteration);
mTimeFrameGPU->loadVertices(iteration);
mTimeFrameGPU->loadROframeClustersDevice(iteration);
mTimeFrameGPU->createUsedClustersDevice(iteration);
mTimeFrameGPU->loadIndexTableUtils(iteration);
}

template <int nLayers>
Expand Down Expand Up @@ -95,7 +98,8 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};

computeTrackletsInRofsHandler<nLayers>(mTimeFrameGPU->getDeviceMultCutMask(),
computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
mTimeFrameGPU->getDeviceMultCutMask(),
startROF,
endROF,
mTimeFrameGPU->getNrof(),
Expand All @@ -106,6 +110,15 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
mTimeFrameGPU->getPrimaryVerticesNum(),
mTimeFrameGPU->getDeviceArrayClusters(),
mTimeFrameGPU->getDeviceROframeClusters(),
mTimeFrameGPU->getDeviceArrayUsedClusters(),
mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
iteration,
mTrkParams[iteration].NSigmaCut,
mTimeFrameGPU->getPhiCuts(),
mTrkParams[iteration].PVres,
mTimeFrameGPU->getMinRs(),
mTimeFrameGPU->getMaxRs(),
mTimeFrameGPU->getPositionResolutions(),
mTrkParams[iteration].LayerRadii,
mTimeFrameGPU->getMSangles(),
conf.nBlocks,
Expand Down Expand Up @@ -324,6 +337,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
}
}
mTimeFrameGPU->loadUsedClustersDevice();
if (iteration == mTrkParams.size() - 1) {
mTimeFrameGPU->unregisterHostMemory(0);
}
Expand Down
Loading

0 comments on commit 51e7404

Please sign in to comment.