Skip to content

Commit

Permalink
Fix update of the used clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas committed Dec 6, 2024
1 parent 1e8e0ad commit 7842b04
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ int filterCellNeighboursHandler(std::vector<int>&,
void processNeighboursHandler(const int iteration,
const int startLayer,
const int startLevel,
CellSeed** currentCellSeeds,
CellSeed** allCellSeeds,
CellSeed* currentCellSeeds,
const unsigned int nCurrentCells,
// const int* currentCellIds,
// const unsigned int nCurrentCellsIds,
Expand Down
6 changes: 3 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,12 @@ template <int nLayers>
void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex)
{
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
// TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
mTimeFrameGPU->createTrackletsLUTDevice(iteration);

const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
int endROF{o2::gpu::CAMath::Min(mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())};

countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
mTimeFrameGPU->getDeviceMultCutMask(),
Expand Down Expand Up @@ -284,6 +283,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
startLayer,
startLevel,
mTimeFrameGPU->getDeviceArrayCells(),
mTimeFrameGPU->getDeviceCells()[startLayer],
mTimeFrameGPU->getNCells()[startLayer],
mTimeFrameGPU->getDeviceArrayUsedClusters(),
mTimeFrameGPU->getDeviceNeighbours(startLayer - 1),
Expand Down Expand Up @@ -384,8 +384,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
}
mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
}
mTimeFrameGPU->loadUsedClustersDevice();
}
mTimeFrameGPU->loadUsedClustersDevice();
if (iteration == mTrkParams.size() - 1) {
mTimeFrameGPU->unregisterHostMemory(0);
}
Expand Down
149 changes: 112 additions & 37 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,6 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
0.f, 0.f, 0.f, 0.f, sg2q2pt});
}

// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); };
// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; };

struct sort_tracklets {
GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }
};
Expand Down Expand Up @@ -584,14 +581,13 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
}
}

template <bool dryRun, int nLayers = 7>
GPUg() void processNeighboursKernel(const int iteration,
const int layer,
template <bool dryRun, bool debug = false, int nLayers = 7>
GPUg() void processNeighboursKernel(const int layer,
const int level,
CellSeed** currentCellSeeds,
const unsigned int nCurrentCells,
CellSeed** allCellSeeds,
CellSeed* currentCellSeeds,
const int* currentCellIds,
// const unsigned int nCurrentCellsIds,
const unsigned int nCurrentCells,
CellSeed* updatedCellSeeds,
int* updatedCellsIds,
int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration
Expand All @@ -609,7 +605,12 @@ GPUg() void processNeighboursKernel(const int iteration,
constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment.
for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) {
int foundSeeds{0};
const auto& currentCell{currentCellSeeds[layer][iCurrentCell]};
const auto& currentCell{currentCellSeeds[iCurrentCell]};
if constexpr (debug) {
if (threadIdx.x == 0 && currentCellIds != nullptr) {
currentCellSeeds[iCurrentCell].printCell();
}
}
if (currentCell.getLevel() != level) {
continue;
}
Expand All @@ -619,12 +620,17 @@ GPUg() void processNeighboursKernel(const int iteration,
continue;
}
const int cellId = currentCellIds == nullptr ? iCurrentCell : currentCellIds[iCurrentCell];
// if constexpr (debug) {
// if (threadIdx.x == 0 && currentCellIds != nullptr) {
// printf("-> ic: %d cellId: %d \n", iCurrentCell, cellId);
// }
// }
const int startNeighbourId{cellId ? neighboursLUT[cellId - 1] : 0};
const int endNeighbourId{neighboursLUT[cellId]};

for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) {
const int neighbourCellId = neighbours[iNeighbourCell];
const CellSeed& neighbourCell = currentCellSeeds[layer - 1][neighbourCellId];
const CellSeed& neighbourCell = allCellSeeds[layer - 1][neighbourCellId];
if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) {
continue;
}
Expand Down Expand Up @@ -684,6 +690,7 @@ GPUd() void pPointer(T* ptr)
{
printf("[%p]\t", ptr);
}

template <typename... Args>
GPUg() void printPointersKernel(std::tuple<Args...> args)
{
Expand Down Expand Up @@ -791,6 +798,15 @@ GPUg() void printTrackletsLUTPerROF(const int layerId,
}
}
}

GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = 0)
{
for (unsigned int iCell{0}; iCell < nCells; ++iCell) {
if (threadIdx.x == tId) {
seed[iCell].printCell();
}
}
}
} // namespace gpu

template <int nLayers>
Expand Down Expand Up @@ -1163,7 +1179,8 @@ int filterCellNeighboursHandler(std::vector<int>& neighHost, // TODO: eventually
void processNeighboursHandler(const int iteration,
const int startLayer,
const int startLevel,
CellSeed** currentCellSeeds,
CellSeed** allCellSeeds,
CellSeed* currentCellSeeds,
const unsigned int nCurrentCells,
// const int* currentCellIds,
// const unsigned int nCurrentCellsIds,
Expand All @@ -1181,12 +1198,12 @@ void processNeighboursHandler(const int iteration,
const int nThreads)
{
thrust::device_vector<int> foundSeedsTable(nCurrentCells + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this.
gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(iteration,
startLayer,
gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(startLayer,
startLevel,
allCellSeeds,
currentCellSeeds,
nCurrentCells,
nullptr, // currentCellIds,
nCurrentCells,
nullptr, // updatedCellSeeds,
nullptr, // updatedCellsIds,
thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
Expand Down Expand Up @@ -1217,30 +1234,88 @@ void processNeighboursHandler(const int iteration,
thrust::device_vector<int> updatedCellIds(foundSeedsTable.back()), lastCellIds(foundSeedsTable.back());
thrust::device_vector<CellSeed> updatedCellSeeds(foundSeedsTable.back()), lastCellSeeds(foundSeedsTable.back());
gpu::processNeighboursKernel<false><<<1, 1>>>(iteration,
startLayer,
startLevel,
currentCellSeeds,
nCurrentCells,
nullptr, // currentCellIds,
thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds
thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds
thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
usedClusters, // Used clusters
neighbours,
neighboursLUT,
foundTrackingFrameInfo,
bz,
MaxChi2ClusterAttachment,
propagator,
matCorrType);
gpu::processNeighboursKernel<false><<<nBlocks, nThreads>>>(startLayer,
startLevel,
allCellSeeds,
currentCellSeeds,
nullptr, // currentCellIds,
nCurrentCells,
thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds
thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds
thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
usedClusters, // Used clusters
neighbours,
neighboursLUT,
foundTrackingFrameInfo,
bz,
MaxChi2ClusterAttachment,
propagator,
matCorrType);
LOGP(info, "1: updatedCellIds {} - updatedCellSeeds {}", updatedCellIds.size(), updatedCellSeeds.size());
// gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&updatedCellSeeds[0]), foundSeedsTable.back());
// gpu::printBufferLayerOnThread<<<1, 1>>>(0, thrust::raw_pointer_cast(&updatedCellIds[0]), updatedCellIds.size());
int level = startLevel;
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
lastCellSeeds.swap(updatedCellSeeds);
lastCellIds.swap(updatedCellIds);
// std::vector<CellSeed>().swap(updatedCellSeeds); /// tame the memory peaks
// updatedCellId.clear();
// processNeighbours(iLayer, --level, lastCellSeed, lastCellId, updatedCellSeed, updatedCellId);
LOGP(info, "iLayer: {} level {}", iLayer, level);
--level;
// lastCellSeeds.swap(updatedCellSeeds);
// lastCellIds.swap(updatedCellIds);
// // gpu::printCellSeeds<<<1, 1>>>(thrust::raw_pointer_cast(&lastCellSeeds[0]), lastCellSeeds.size());
// foundSeedsTable.resize(lastCellSeeds.size() + 1);
// thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
// gpu::processNeighboursKernel<true, true><<<1, 1>>>(iLayer,
// level,
// allCellSeeds,
// thrust::raw_pointer_cast(&lastCellSeeds[0]),
// thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds,
// lastCellSeeds.size(),
// nullptr, // updatedCellSeeds,
// nullptr, // updatedCellsIds,
// thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
// usedClusters, // Used clusters
// neighbours,
// neighboursLUT,
// foundTrackingFrameInfo,
// bz,
// MaxChi2ClusterAttachment,
// propagator,
// matCorrType);
// gpu::printBufferLayerOnThread<<<1, 1>>>(0, thrust::raw_pointer_cast(&foundSeedsTable[0]), foundSeedsTable.size());
// gpuCheckError(cudaPeekAtLastError());
// gpuCheckError(cudaDeviceSynchronize());
// gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage
// temp_storage_bytes, // temp_storage_bytes
// thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
// thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
// lastCellSeeds.size() + 1, // num_items
// 0));
// discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
// gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage
// temp_storage_bytes_2, // temp_storage_bytes
// thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in
// thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out
// lastCellSeeds.size() + 1, // num_items
// 0));
// updatedCellIds.resize(foundSeedsTable.back(), 0);
// updatedCellSeeds.resize(foundSeedsTable.back(), CellSeed());
// gpu::processNeighboursKernel<false><<<nBlocks, nThreads>>>(iLayer,
// level,
// allCellSeeds,
// thrust::raw_pointer_cast(&lastCellSeeds[0]),
// thrust::raw_pointer_cast(&lastCellIds[0]), // currentCellIds,
// lastCellSeeds.size(),
// thrust::raw_pointer_cast(&updatedCellSeeds[0]), // updatedCellSeeds
// thrust::raw_pointer_cast(&updatedCellIds[0]), // updatedCellsIds
// thrust::raw_pointer_cast(&foundSeedsTable[0]), // auxiliary only in GPU code to compute the number of cells per iteration
// usedClusters, // Used clusters
// neighbours,
// neighboursLUT,
// foundTrackingFrameInfo,
// bz,
// MaxChi2ClusterAttachment,
// propagator,
// matCorrType);
// LOGP(info, " -> 2: updatedCellIds {} - updatedCellSeeds {}", updatedCellIds.size(), updatedCellSeeds.size());
}
gpuCheckError(cudaFree(d_temp_storage));
Expand Down
8 changes: 7 additions & 1 deletion Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cell.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

#ifndef TRACKINGITSU_INCLUDE_CACELL_H_
#define TRACKINGITSU_INCLUDE_CACELL_H_

#include <iostream>
#ifndef GPUCA_GPUCODE_DEVICE
#include <array>
#include <vector>
Expand Down Expand Up @@ -104,6 +104,7 @@ class CellSeed final : public o2::track::TrackParCovF
GPUhd() int* getLevelPtr() { return &mLevel; }
GPUhd() int* getClusters() { return mClusters; }
GPUhd() int getCluster(int i) const { return mClusters[i]; }
GPUhdi() void printCell();

private:
int mClusters[7] = {-1, -1, -1, -1, -1, -1, -1};
Expand All @@ -112,6 +113,11 @@ class CellSeed final : public o2::track::TrackParCovF
float mChi2 = 0.f;
};

GPUhdi() void CellSeed::printCell()
{
printf("trkl: %d, %d\t lvl: %d\t chi2: %f\n", mTracklets[0], mTracklets[1], mLevel, mChi2);
}

} // namespace its
} // namespace o2
#endif /* TRACKINGITSU_INCLUDE_CACELL_H_ */
Loading

0 comments on commit 7842b04

Please sign in to comment.