diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index b9cfe3edd89af..ad8724f315ec8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -73,6 +73,7 @@ class TimeFrameGPU : public TimeFrame void downloadCellsNeighboursDevice(std::vector>>&, const int); void downloadNeighboursLUTDevice(std::vector&, const int); void downloadCellsDevice(); + void downloadCellsLUTDevice(); void unregisterRest(); void initDeviceChunks(const int, const int); template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 62cae30421990..f13bcf0aa5a40 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -202,11 +202,11 @@ void TimeFrameGPU::loadCellsDevice() allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), mCells[iLayer].size() * sizeof(CellSeed), nullptr, getExtAllocator()); allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mCells[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); // accessory for the neigh. finding. checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); - checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable)); + // checkGPUError(cudaHostRegister(mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mCellsDevice[iLayer], mCells[iLayer].data(), mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); + // checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -216,12 +216,12 @@ void TimeFrameGPU::createCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB); + LOGP(info, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB); allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable)); + // checkGPUError(cudaHostRegister(mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -234,18 +234,32 @@ void TimeFrameGPU::createCellsBuffers(const int layer) checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost)); LOGP(info, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator()); + // checkGPUError(cudaMemsetAsync(mCellsDevice[layer], 0, mNCells[layer] * sizeof(CellSeed), mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } +template +void TimeFrameGPU::downloadCellsLUTDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); + for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { + LOGP(info, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1)); + mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1); + // checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), (mTracklets[iLayer].size() + 1) * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - LOGP(info, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", mCellsLookupTable[iLayer].size(), iLayer, mCellsLookupTable[iLayer].size() * sizeof(int) / MB); - checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + LOGP(info, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", (mTracklets[iLayer].size() + 1), iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB); + // checkGPUError(cudaHostRegister(mCellsLookupTable[iLayer].data(), (mTracklets[iLayer].size() + 1) * sizeof(int), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], mCellsLookupTable[iLayer].data(), (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -313,7 +327,7 @@ void TimeFrameGPU::downloadCellsDevice() LOGP(info, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); mCells[iLayer].resize(mNCells[iLayer]); checkGPUError(cudaMemcpyAsync(mCells[iLayer].data(), mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); - checkGPUError(cudaHostUnregister(mCells[iLayer].data())); + // checkGPUError(cudaHostUnregister(mCells[iLayer].data())); } } @@ -352,7 +366,7 @@ void TimeFrameGPU::unregisterRest() START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory"); LOGP(debug, "unregistering rest of the host memory..."); // checkGPUError(cudaHostUnregister(mCells[0].data())); - checkGPUError(cudaHostUnregister(mCellsDevice.data())); + // checkGPUError(cudaHostUnregister(mCellsDevice.data())); checkGPUError(cudaHostUnregister(mCellsLUTDevice.data())); checkGPUError(cudaHostUnregister(mTrackletsDevice.data())); checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data())); @@ -361,7 +375,7 @@ void TimeFrameGPU::unregisterRest() checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data())); } if (iLayer < nLayers - 3) { - checkGPUError(cudaHostUnregister(mCellsLookupTable[iLayer].data())); + checkGPUError(cudaHostUnregister(mCellsLookupTable[iLayer].data())); } checkGPUError(cudaHostUnregister(mTracklets[iLayer].data())); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 26ed50f62f8e3..2302fc4727be9 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -323,7 +323,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) mTimeFrameGPU->loadTrackletsLUTDevice(); mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - TrackerTraits::computeLayerCells(iteration); + // TrackerTraits::computeLayerCells(iteration); // for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { // mTimeFrame->getCells()[iLayer].clear(); @@ -374,16 +374,21 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) conf.nBlocks, conf.nThreads); } + mTimeFrameGPU->downloadCellsLUTDevice(); + mTimeFrameGPU->downloadCellsDevice(); } template void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) { mTimeFrameGPU->loadCellsDevice(); - mTimeFrameGPU->createNeighboursDevice(); + // mTimeFrameGPU->createNeighboursDevice(); mTimeFrameGPU->loadCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); + for (int* p : mTimeFrameGPU->getDeviceCellLUTs()) { + std::cout << "+++++++++>" << p << std::endl; + } for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])}; mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); @@ -434,7 +439,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) mTimeFrameGPU->getDeviceNeighbours(iLayer), cellsNeighboursLayer[iLayer].size()); } - mTimeFrameGPU->downloadCellsDevice(); + mTimeFrameGPU->unregisterRest(); }; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 234a65d590330..0bde0cb68cbe5 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -423,6 +423,21 @@ GPUhd() float Sq(float q) return q * q; } +template +GPUd() void pPointer(T* ptr) +{ + printf("[%p]\t", ptr); +} + +template +GPUg() void printPointersKernel(std::tuple args) +{ + auto print_all = [&](auto... ptrs) { + (pPointer(ptrs), ...); + }; + std::apply(print_all, args); +} + // Functors to sort tracklets template struct trackletSortEmptyFunctor : public thrust::binary_function { @@ -454,6 +469,32 @@ GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int } } +GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) +{ + if (blockIdx.x * blockDim.x + threadIdx.x == tId) { + for (int i{0}; i < rowLength; ++i) { + if (!(i % len)) { + printf("\n row %d: ===> %d/%d\t", row, i, (int)rowLength); + } + printf("%d\t", mat[row][i]); + } + printf("\n"); + } +} + +GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned int size, const int len = 150, const unsigned int tId = 0) +{ + if (blockIdx.x * blockDim.x + threadIdx.x == tId) { + for (int i{0}; i < size; ++i) { + if (!(i % len)) { + printf("\n layer %d: ===> %d/%d\t", layer, i, (int)size); + } + printf("%p\t", (void*)v[i]); + } + printf("\n"); + } +} + // Dump vertices GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0) { @@ -810,7 +851,6 @@ void countCellsHandler( cellsLUTsHost, // d_out nTracklets + 1, // num_items 0)); - // gpu::printBufferLayerOnThread<<<1, 1>>>(layer, cellsLUTsHost, nTracklets + 1); gpuCheckError(cudaFree(d_temp_storage)); } @@ -846,6 +886,7 @@ void computeCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float + // gpu::printBufferPointersLayerOnThread<<<1, 1>>>(0, (void**)cellsLUTsArrayDevice, 5); } void countCellNeighboursHandler(CellSeed** cellsLayersDevice, @@ -862,6 +903,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nBlocks, const int nThreads) { + gpu::printMatrixRow<<<1, 1>>>(layerIndex, cellsLUTs, nCells); gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT,