Skip to content

Commit

Permalink
CP
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Dec 5, 2024
1 parent afaf9fc commit f8824df
Show file tree
Hide file tree
Showing 7 changed files with 306 additions and 85 deletions.
11 changes: 8 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame
void createCellsBuffers(const int);
void createCellsDevice();
void createCellsLUTDevice();
void createNeighboursDevice();
void createNeighboursIndexTablesDevice();
void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
void createNeighboursLUTDevice(const int, const unsigned int);
void createNeighboursDeviceArray();
void createTrackITSExtDevice(std::vector<CellSeed>&);
void downloadTrackITSExtDevice(std::vector<CellSeed>&);
void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
Expand Down Expand Up @@ -113,7 +114,9 @@ class TimeFrameGPU : public TimeFrame
Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
Expand Down Expand Up @@ -195,7 +198,9 @@ class TimeFrameGPU : public TimeFrame

Road<nLayers - 2>* mRoadsDevice;
TrackITSExt* mTrackITSExtDevice;
std::array<gpuPair<int, int>*, nLayers - 2> mNeighboursDevice;
std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
std::array<int*, nLayers - 2> mNeighboursDevice;
int** mNeighboursDeviceArray;
std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;

Expand Down
27 changes: 24 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,30 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
const int nBlocks,
const int nThreads);

void filterCellNeighboursHandler(std::vector<int>&,
gpuPair<int, int>*,
unsigned int);
int filterCellNeighboursHandler(std::vector<int>&,
gpuPair<int, int>*,
int*,
unsigned int);

void processNeighboursHandler(const int iteration,
const int startLayer,
const int startLevel,
CellSeed** currentCellSeeds,
const unsigned int nCurrentCells,
// const int* currentCellIds,
// const unsigned int nCurrentCellsIds,
// CellSeed* updatedCellSeeds,
// int* updatedCellsIds,
const unsigned char** usedClusters, // Used clusters
int* neighbours,
int* neighboursLUT,
const TrackingFrameInfo** foundTrackingFrameInfo,
const float bz,
const float MaxChi2ClusterAttachment,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType,
const int nBlocks,
const int nThreads);

void trackSeedHandler(CellSeed* trackSeeds,
const TrackingFrameInfo** foundTrackingFrameInfo,
Expand Down
2 changes: 1 addition & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
if(CUDA_ENABLED)
find_package(CUDAToolkit)
message(STATUS "Building ITS CUDA tracker")
# add_compile_options(-O0 -g -lineinfo -fPIC)
add_compile_options(-O0 -g -lineinfo -fPIC)
# add_compile_definitions(ITS_MEASURE_GPU_TIME)
o2_add_library(ITStrackingCUDA
SOURCES ClusterLinesGPU.cu
Expand Down
38 changes: 25 additions & 13 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursDevice()
void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
// Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
}
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

Expand Down Expand Up @@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
mCellsNeighbours[layer].clear();
mCellsNeighbours[layer].resize(neighbours.size());
LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

Expand Down Expand Up @@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
// TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
}

template <int nLayers>
Expand Down
45 changes: 31 additions & 14 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
template <int nLayers>
void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
{
mTimeFrameGPU->createNeighboursDevice();
mTimeFrameGPU->createNeighboursIndexTablesDevice();
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
Expand All @@ -228,17 +228,16 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
continue;
}

int layerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer].size())};
mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
mTimeFrameGPU->getDeviceArrayCellsLUT(),
mTimeFrameGPU->getDeviceNeighbours(iLayer),
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
mTrkParams[0].MaxChi2ClusterAttachment,
mBz,
iLayer,
layerCellsNum,
mTimeFrameGPU->getNCells()[iLayer],
nextLayerCellsNum,
1e2,
conf.nBlocks,
Expand All @@ -250,12 +249,12 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
mTimeFrameGPU->getDeviceArrayCellsLUT(),
mTimeFrameGPU->getDeviceNeighbours(iLayer),
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
mTrkParams[0].MaxChi2ClusterAttachment,
mBz,
iLayer,
layerCellsNum,
mTimeFrameGPU->getNCells()[iLayer],
nextLayerCellsNum,
1e2,
conf.nBlocks,
Expand All @@ -264,20 +263,38 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());

filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
mTimeFrameGPU->getDeviceNeighbours(iLayer),
cellsNeighboursLayer[iLayer].size());
}
mTimeFrameGPU->createNeighboursDeviceArray();
mTimeFrameGPU->downloadCellsDevice();
mTimeFrameGPU->unregisterRest();
};

template <int nLayers>
void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
{
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
for (int startLevel{mTrkParams[iteration].CellsPerRoad()}; startLevel >= mTrkParams[iteration].CellMinimumLevel(); --startLevel) {
const int minimumLayer{startLevel - 1};
std::vector<CellSeed> trackSeeds;
for (int startLayer{mTrkParams[iteration].CellsPerRoad() - 1}; startLayer >= minimumLayer; --startLayer) {
processNeighboursHandler(iteration,
startLayer,
startLevel,
mTimeFrameGPU->getDeviceArrayCells(),
mTimeFrameGPU->getNCells()[startLayer],
mTimeFrameGPU->getDeviceArrayUsedClusters(),
mTimeFrameGPU->getDeviceNeighbours(startLayer - 1),
mTimeFrameGPU->getDeviceNeighboursLUT(startLayer - 1),
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
mBz,
mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
mTimeFrameGPU->getDevicePropagator(),
mCorrType,
conf.nBlocks,
conf.nThreads);
std::vector<int> lastCellId, updatedCellId;
std::vector<CellSeed> lastCellSeed, updatedCellSeed;

Expand All @@ -304,15 +321,15 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
}
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds,
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo,
mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks,
trackSeeds.size(), // const size_t nSeeds,
mBz, // const float Bz,

trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo
mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks
trackSeeds.size(), // const size_t nSeeds
mBz, // const float Bz
startLevel, // const int startLevel,
mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment,
mTrkParams[0].MaxChi2NDF, // float maxChi2NDF,
mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
mTrkParams[0].MaxChi2NDF, // float maxChi2NDF
mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator
mCorrType, // o2::base::PropagatorImpl<float>::MatCorrType
conf.nBlocks,
Expand Down
Loading

0 comments on commit f8824df

Please sign in to comment.