Skip to content

Commit

Permalink
tracklets on gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Dec 2, 2024
1 parent 208ea84 commit ff486d4
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,14 @@ class TimeFrameGPU : public TimeFrame
// Host-specific getters
gsl::span<int> getHostNTracklets(const int chunkId);
gsl::span<int> getHostNCells(const int chunkId);
gsl::span<int, nLayers - 1> getNTracklets() { return mNTracklets; }
gsl::span<int, nLayers - 2> getNCells() { return mNCells; }

// Host-available device getters
gsl::span<int*> getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; }
gsl::span<int*> getDeviceCellLUTs() { return mCellsLUTDevice; }
gsl::span<Tracklet*> getDeviceTracklet() { return mTrackletsDevice; }
gsl::span<CellSeed*> getDeviceCells() { return mCellsDevice; }
gsl::span<int, nLayers - 2> getNCellsDevice() { return mNCells; }

private:
void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
const unsigned char** usedClusters,
const int** clustersIndexTables,
Tracklet** tracklets,
gsl::span<Tracklet*> spanTracklets,
gsl::span<int> nTracklets,
int** trackletsLUTs,
gsl::span<int*> trackletsLUTsHost,
const int iteration,
const float NSigmaCut,
std::vector<float>& phiCuts,
Expand Down
7 changes: 0 additions & 7 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -489,13 +489,6 @@ void TimeFrameGPU<nLayers>::unregisterRest()
LOGP(debug, "unregistering rest of the host memory...");
checkGPUError(cudaHostUnregister(mCellsDevice.data()));
checkGPUError(cudaHostUnregister(mTrackletsDevice.data()));
checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data()));
for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
if (iLayer < nLayers - 2) {
checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data()));
}
checkGPUError(cudaHostUnregister(mTracklets[iLayer].data()));
}
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

Expand Down
32 changes: 16 additions & 16 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
mTimeFrameGPU->getDeviceArrayUsedClusters(),
mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
mTimeFrameGPU->getDeviceArrayTracklets(),
mTimeFrameGPU->getDeviceTracklet(),
mTimeFrameGPU->getNTracklets(),
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
mTimeFrameGPU->getDeviceTrackletsLUTs(),
iteration,
mTrkParams[iteration].NSigmaCut,
mTimeFrameGPU->getPhiCuts(),
Expand All @@ -161,25 +164,22 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
template <int nLayers>
void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
{
mTimeFrameGPU->loadTrackletsDevice();
mTimeFrameGPU->loadTrackletsLUTDevice();
mTimeFrameGPU->createCellsLUTDevice();
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();

// #pragma omp parallel for num_threads(nLayers)
for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() ||
mTimeFrameGPU->getTracklets()[iLayer].empty()) {
if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) {
LOGP(info, "continuing here");
continue;
}

const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getTracklets()[iLayer].size())};
LOGP(info, "+> {}", mTimeFrameGPU->getNTracklets()[iLayer]);
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(),
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
mTimeFrameGPU->getDeviceArrayTracklets(),
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
mTimeFrameGPU->getTracklets()[iLayer].size(),
mTimeFrameGPU->getNTracklets()[iLayer],
iLayer,
nullptr,
mTimeFrameGPU->getDeviceArrayCellsLUT(),
Expand All @@ -196,7 +196,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
mTimeFrameGPU->getDeviceArrayTracklets(),
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
mTimeFrameGPU->getTracklets()[iLayer].size(),
mTimeFrameGPU->getNTracklets()[iLayer],
iLayer,
mTimeFrameGPU->getDeviceCells()[iLayer],
mTimeFrameGPU->getDeviceArrayCellsLUT(),
Expand All @@ -220,7 +220,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])};
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);

Expand Down Expand Up @@ -283,7 +283,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
std::vector<int> lastCellId, updatedCellId;
std::vector<CellSeed> lastCellSeed, updatedCellSeed;

processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);

int level = startLevel;
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
Expand Down Expand Up @@ -337,8 +337,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
if (track.getClusterIndex(iLayer) == UnusedIndex) {
continue;
}
nShared += int(mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer)));
isFirstShared |= !iLayer && mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer));
nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)));
isFirstShared |= !iLayer && mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer));
}

if (nShared > mTrkParams[0].ClusterSharing) {
Expand All @@ -350,8 +350,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
if (track.getClusterIndex(iLayer) == UnusedIndex) {
continue;
}
mTimeFrame->markUsedCluster(iLayer, track.getClusterIndex(iLayer));
int currentROF = mTimeFrame->getClusterROF(iLayer, track.getClusterIndex(iLayer));
mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer));
int currentROF = mTimeFrameGPU->getClusterROF(iLayer, track.getClusterIndex(iLayer));
for (int iR{0}; iR < 3; ++iR) {
if (rofs[iR] == INT_MAX) {
rofs[iR] = currentROF;
Expand All @@ -367,7 +367,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
if (rofs[1] != INT_MAX) {
track.setNextROFbit();
}
mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
}
}
mTimeFrameGPU->loadUsedClustersDevice();
Expand Down
49 changes: 43 additions & 6 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,17 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
0.f, 0.f, 0.f, 0.f, sg2q2pt});
}

// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); };
// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; };

struct sort_tracklets {
GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }
};

struct equal_tracklets {
GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }
};

template <typename T1, typename T2>
struct pair_to_first : public thrust::unary_function<gpuPair<T1, T2>, T1> {
GPUhd() int operator()(const gpuPair<T1, T2>& a) const
Expand Down Expand Up @@ -686,10 +697,7 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
const int nTracklets)
{
for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) {
auto& tracklet{tracklets[currentTrackletIndex]};
if (tracklet.firstClusterIndex >= 0) {
atomicAdd(trackletsLookUpTable + tracklet.firstClusterIndex, 1);
}
atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1);
}
}

Expand Down Expand Up @@ -808,7 +816,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
const unsigned char** usedClusters,
const int** clustersIndexTables,
Tracklet** tracklets,
gsl::span<Tracklet*> spanTracklets,
gsl::span<int> nTracklets,
int** trackletsLUTs,
gsl::span<int*> trackletsLUTsHost,
const int iteration,
const float NSigmaCut,
std::vector<float>& phiCuts,
Expand Down Expand Up @@ -848,8 +859,31 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
resolutions[iLayer],
radii[iLayer + 1] - radii[iLayer],
mulScatAng[iLayer]);
gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());
thrust::device_ptr<Tracklet> tracklets_ptr(spanTracklets[iLayer]);
thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets());
auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets());
nTracklets[iLayer] = unique_end - tracklets_ptr;
LOGP(info, "=> {} {}", nTracklets[iLayer], unique_end - tracklets_ptr);
if (iLayer > 0) {
gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int)));
gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]);
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage
temp_storage_bytes, // temp_storage_bytes
trackletsLUTsHost[iLayer], // d_in
trackletsLUTsHost[iLayer], // d_out
nClusters[iLayer] + 1, // num_items
0)); // NOLINT: this is the offset of the sum, not a pointer
discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage
temp_storage_bytes, // temp_storage_bytes
trackletsLUTsHost[iLayer], // d_in
trackletsLUTsHost[iLayer], // d_out
nClusters[iLayer] + 1, // num_items
0)); // NOLINT: this is the offset of the sum, not a pointer
gpuCheckError(cudaFree(d_temp_storage));
}
}
}

Expand Down Expand Up @@ -1127,7 +1161,10 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils,
const unsigned char** usedClusters,
const int** clustersIndexTables,
Tracklet** tracklets,
gsl::span<Tracklet*> spanTracklets,
gsl::span<int> nTracklets,
int** trackletsLUTs,
gsl::span<int*> trackletsLUTsHost,
const int iteration,
const float NSigmaCut,
std::vector<float>& phiCuts,
Expand Down
42 changes: 2 additions & 40 deletions Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -193,56 +193,17 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
}
}
}
// if (rof0 == 81) {
// printf("CPU layer: %d -> %f %f %f %f %f %f %f %f\n",
// iLayer,
// mTrkParams[iteration].NSigmaCut,
// tf->getPhiCut(iLayer),
// mTrkParams[iteration].PVres,
// tf->getMinR(iLayer + 1),
// tf->getMaxR(iLayer + 1),
// tf->getPositionResolution(iLayer),
// meanDeltaR,
// tf->getMSangle(iLayer));
// }
}
}
if (!tf->checkMemory(mTrkParams[iteration].MaxMemory)) {
return;
}

// for (auto iLayer{0}; iLayer < tf->getTracklets().size(); ++iLayer) {
// std::cout << "tracklets layer " << iLayer << ": " << tf->getTracklets()[iLayer].size() << std::endl;
// }

// for (auto iLayer{0}; iLayer < tf->getTrackletsLookupTable().size(); ++iLayer) {
// auto lut = tf->getTrackletsLookupTable()[iLayer];
// for (unsigned int iC{0}; iC < lut.size(); ++iC) {
// if (!(iC % 150)) {
// printf("\n row %d: ===> %d/%d\t", iLayer, iC, (int)lut.size());
// }
// printf("%d\t", lut[iC]);
// }
// }

// for (auto rofId{0}; rofId < 2304; ++rofId) {
// int nClus = tf->getClustersOnLayer(rofId, 1).size();
// if (!nClus) {
// continue;
// }
// printf("rof: %d (%d) ==> ", rofId, nClus);

// for (int iC{0}; iC < nClus; ++iC) {
// int nT = tf->getTrackletsLookupTable()[0][tf->getSortedIndex(rofId, 1, iC)];
// printf("%d\t", nT);
// }
// printf("\n");
// }

#pragma omp parallel for num_threads(mNThreads)
for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
/// Sort tracklets
auto& trkl{tf->getTracklets()[iLayer + 1]};
auto oldsize{trkl.size()};
std::sort(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) {
return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex);
});
Expand All @@ -265,6 +226,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
/// Compute LUT
std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0);
lut.push_back(trkl.size());
LOGP(info, "CPU layer {} -> old size: {} - new size: {}", iLayer, oldsize, trkl.size());
}
/// Layer 0 is done outside the loop
std::sort(tf->getTracklets()[0].begin(), tf->getTracklets()[0].end(), [](const Tracklet& a, const Tracklet& b) {
Expand Down

0 comments on commit ff486d4

Please sign in to comment.