
Commit

Save work
mconcas committed Nov 5, 2024
1 parent 4f66717 commit c1998b8
Showing 4 changed files with 76 additions and 14 deletions.
@@ -73,6 +73,7 @@ class TimeFrameGPU : public TimeFrame
void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
void downloadNeighboursLUTDevice(std::vector<int>&, const int);
void downloadCellsDevice();
void downloadCellsLUTDevice();
void unregisterRest();
void initDeviceChunks(const int, const int);
template <Task task>
34 changes: 24 additions & 10 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -202,11 +202,11 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator());
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mCells[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding.
checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable));
// checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
}
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
// checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
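With the cudaHostRegister calls above commented out, the cudaMemcpyAsync uploads now read from pageable host memory, so the runtime stages them through an internal pinned buffer and the copies are no longer guaranteed to overlap with other work on the stream. For reference, a minimal sketch of the pinned-upload pattern the registered path corresponds to (illustrative helper, not the O2 wrappers):

#include <cuda_runtime.h>
#include <vector>

template <typename T>
void uploadPinnedAsync(std::vector<T>& host, T* device, cudaStream_t stream)
{
  const size_t bytes = host.size() * sizeof(T);
  cudaHostRegister(host.data(), bytes, cudaHostRegisterPortable); // pin the pageable buffer
  cudaMemcpyAsync(device, host.data(), bytes, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream); // the buffer must stay pinned until the copy has completed
  cudaHostUnregister(host.data()); // must be paired with the registration above
}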
@@ -216,12 +216,12 @@ void TimeFrameGPU<nLayers>::createCellsLUTDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs");
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB);
LOGP(info, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
}
allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable));
// checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
@@ -234,18 +234,32 @@ void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost));
LOGP(info, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB);
allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator());
// checkGPUError(cudaMemsetAsync(mCellsDevice[layer], 0, mNCells[layer] * sizeof(CellSeed), mGpuStreams[0].get()));

STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
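createCellsBuffers relies on the cells LUT being an exclusive prefix sum with one extra slot: the entry at index mTracklets[layer].size() holds the total number of cells, so a single-int device-to-host copy is enough to size the new cell buffer. A hedged sketch of that convention (illustrative names, not the O2 members):

#include <cuda_runtime.h>

// For N counts scanned into a LUT of size N + 1, lut[N] holds the grand total.
int readTotalFromLUT(const int* dLut, int nEntries, cudaStream_t stream)
{
  int total = 0;
  cudaMemcpyAsync(&total, dLut + nEntries, sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream); // make sure the value has arrived before using it
  return total;
}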

template <int nLayers>
void TimeFrameGPU<nLayers>::downloadCellsLUTDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
LOGP(info, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1));
mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1);
// checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), (mTracklets[iLayer].size() + 1) * sizeof(int), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
}
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
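Note on the download above: cudaMemcpyAsync takes the destination first and the source second, so with cudaMemcpyDeviceToHost the host buffer is expected as the first argument. If the intent is to fill mCellsLookupTable[iLayer] from the device LUT, the call would normally be written with the first two arguments swapped, along the lines of:

checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));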

template <int nLayers>
void TimeFrameGPU<nLayers>::loadCellsLUTDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs");
for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
LOGP(info, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB);
checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
LOGP(info, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", (mTracklets[iLayer].size() + 1), iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB);
// checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), (mTracklets[iLayer].size() + 1) * sizeof(int), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
}
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}
@@ -313,7 +327,7 @@ void TimeFrameGPU<nLayers>::downloadCellsDevice()
LOGP(info, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
mCells[iLayer].resize(mNCells[iLayer]);
checkGPUError(cudaMemcpyAsync(mCells[iLayer].data(), mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
checkGPUError(cudaHostUnregister(mCells[iLayer].data()));
// checkGPUError(cudaHostUnregister(mCells[iLayer].data()));
}
}

@@ -352,7 +366,7 @@ void TimeFrameGPU<nLayers>::unregisterRest()
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory");
LOGP(debug, "unregistering rest of the host memory...");
// checkGPUError(cudaHostUnregister(mCells[0].data()));
checkGPUError(cudaHostUnregister(mCellsDevice.data()));
// checkGPUError(cudaHostUnregister(mCellsDevice.data()));
checkGPUError(cudaHostUnregister(mCellsLUTDevice.data()));
checkGPUError(cudaHostUnregister(mTrackletsDevice.data()));
checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data()));
@@ -361,7 +375,7 @@ void TimeFrameGPU<nLayers>::unregisterRest()
checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data()));
}
if (iLayer < nLayers - 3) {
checkGPUError(cudaHostUnregister(mCellsLookupTable[iLayer].data()));
checkGPUError(cudaHostUnregister(mCellsLookupTable[iLayer].data()));
}
checkGPUError(cudaHostUnregister(mTracklets[iLayer].data()));
}
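cudaHostUnregister may only be applied to pointers that were previously pinned with cudaHostRegister; calling it on an unregistered pointer returns an error. Since the registrations of mCells and mCellsDevice are commented out above, the matching unregister calls in downloadCellsDevice and unregisterRest are commented out as well, which keeps the pairs consistent. A small guard sketch for such pairing (illustrative, not the O2 code):

#include <cuda_runtime.h>
#include <cstddef>

struct ScopedHostPin {
  void* ptr = nullptr;
  ScopedHostPin(void* p, std::size_t bytes)
  {
    if (cudaHostRegister(p, bytes, cudaHostRegisterPortable) == cudaSuccess) {
      ptr = p; // remember only successful registrations
    }
  }
  ~ScopedHostPin()
  {
    if (ptr != nullptr) {
      cudaHostUnregister(ptr); // unregister exactly what was registered
    }
  }
};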
11 changes: 8 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -323,7 +323,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
mTimeFrameGPU->loadTrackletsLUTDevice();
mTimeFrameGPU->createCellsLUTDevice();
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
TrackerTraits::computeLayerCells(iteration);
// TrackerTraits::computeLayerCells(iteration);

// for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
// mTimeFrame->getCells()[iLayer].clear();
@@ -374,16 +374,21 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
conf.nBlocks,
conf.nThreads);
}
mTimeFrameGPU->downloadCellsLUTDevice();
mTimeFrameGPU->downloadCellsDevice();
}

template <int nLayers>
void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
{
mTimeFrameGPU->loadCellsDevice();
mTimeFrameGPU->createNeighboursDevice();
// mTimeFrameGPU->createNeighboursDevice();
mTimeFrameGPU->loadCellsLUTDevice();
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
for (int* p : mTimeFrameGPU->getDeviceCellLUTs()) {
std::cout << "+++++++++>" << p << std::endl;
}
for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])};
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
@@ -434,7 +439,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
mTimeFrameGPU->getDeviceNeighbours(iLayer),
cellsNeighboursLayer[iLayer].size());
}
mTimeFrameGPU->downloadCellsDevice();

mTimeFrameGPU->unregisterRest();
};

44 changes: 43 additions & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -423,6 +423,21 @@ GPUhd() float Sq(float q)
return q * q;
}

template <typename T>
GPUd() void pPointer(T* ptr)
{
printf("[%p]\t", ptr);
}

template <typename... Args>
GPUg() void printPointersKernel(std::tuple<Args...> args)
{
auto print_all = [&](auto... ptrs) {
(pPointer(ptrs), ...);
};
std::apply(print_all, args);
}
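printPointersKernel unpacks a std::tuple of pointers on the device, which depends on the host standard library's tuple machinery being usable in device code. An alternative that avoids that dependency is to pass the pointer pack directly to a variadic kernel and expand it with a fold expression (hedged sketch, illustrative names):

#include <cstdio>

template <typename... Ptrs>
__global__ void dumpPointers(Ptrs... ptrs)
{
  if (blockIdx.x * blockDim.x + threadIdx.x == 0) {
    (printf("[%p]\t", (const void*)ptrs), ...); // C++17 fold over the pack
    printf("\n");
  }
}

// Usage: dumpPointers<<<1, 1>>>(dCellsLUT0, dCellsLUT1, dCells0);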

// Functors to sort tracklets
template <typename T>
struct trackletSortEmptyFunctor : public thrust::binary_function<T, T, bool> {
@@ -454,6 +469,32 @@ GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int
}
}

GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0)
{
if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
for (int i{0}; i < rowLength; ++i) {
if (!(i % len)) {
printf("\n row %d: ===> %d/%d\t", row, i, (int)rowLength);
}
printf("%d\t", mat[row][i]);
}
printf("\n");
}
}

GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned int size, const int len = 150, const unsigned int tId = 0)
{
if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
for (int i{0}; i < size; ++i) {
if (!(i % len)) {
printf("\n layer %d: ===> %d/%d\t", layer, i, (int)size);
}
printf("%p\t", (void*)v[i]);
}
printf("\n");
}
}

// Dump vertices
GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0)
{
@@ -810,7 +851,6 @@ void countCellsHandler(
cellsLUTsHost, // d_out
nTracklets + 1, // num_items
0));
// gpu::printBufferLayerOnThread<<<1, 1>>>(layer, cellsLUTsHost, nTracklets + 1);
gpuCheckError(cudaFree(d_temp_storage));
}
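The tail above is the second half of the standard two-phase cub::DeviceScan::ExclusiveSum idiom: the first call (with a null temporary-storage pointer) only reports the scratch size, and the second performs the scan that turns the per-tracklet cell counts into the cells LUT. A minimal standalone sketch of the idiom (illustrative names, not the O2 handler):

#include <cub/cub.cuh>
#include <cuda_runtime.h>

void exclusiveSumIntoLUT(const int* dCounts, int* dLUT, int numItems, cudaStream_t stream)
{
  void* dTemp = nullptr;
  size_t tempBytes = 0;
  cub::DeviceScan::ExclusiveSum(dTemp, tempBytes, dCounts, dLUT, numItems, stream); // size query only
  cudaMalloc(&dTemp, tempBytes);
  cub::DeviceScan::ExclusiveSum(dTemp, tempBytes, dCounts, dLUT, numItems, stream); // actual scan
  cudaFree(dTemp);
}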

@@ -846,6 +886,7 @@ void computeCellsHandler(
maxChi2ClusterAttachment, // const float
cellDeltaTanLambdaSigma, // const float
nSigmaCut); // const float
// gpu::printBufferPointersLayerOnThread<<<1, 1>>>(0, (void**)cellsLUTsArrayDevice, 5);
}

void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
@@ -862,6 +903,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
const int nBlocks,
const int nThreads)
{
gpu::printMatrixRow<<<1, 1>>>(layerIndex, cellsLUTs, nCells);
gpu::computeLayerCellNeighboursKernel<true><<<nBlocks, nThreads>>>(
cellsLayersDevice,
neighboursLUT,
