diff --git a/torchrec/inference/include/torchrec/inference/SingleGPUExecutor.h b/torchrec/inference/include/torchrec/inference/SingleGPUExecutor.h
index 8b9e910f5..9da63d7c2 100644
--- a/torchrec/inference/include/torchrec/inference/SingleGPUExecutor.h
+++ b/torchrec/inference/include/torchrec/inference/SingleGPUExecutor.h
@@ -34,7 +34,8 @@ class SingleGPUExecutor {
       std::shared_ptr observer =
           std::make_shared(),
       c10::Device resultDevice = c10::kCPU,
-      size_t numProcessThreads = 1u);
+      size_t numProcessThreads = 1u,
+      bool useHighPriCudaStream = false);
 
   // Moveable only
   SingleGPUExecutor(SingleGPUExecutor&& executor) noexcept = default;
@@ -50,6 +51,7 @@ class SingleGPUExecutor {
   const ExecInfos execInfos_;
   const size_t numGpu_;
   const size_t numProcessThreads_;
+  const bool useHighPriCudaStream_;
   const c10::Device resultDevice_;
   std::shared_ptr observer_;
   folly::MPMCQueue> requests_;
diff --git a/torchrec/inference/src/SingleGPUExecutor.cpp b/torchrec/inference/src/SingleGPUExecutor.cpp
index 83b502741..38da5527f 100644
--- a/torchrec/inference/src/SingleGPUExecutor.cpp
+++ b/torchrec/inference/src/SingleGPUExecutor.cpp
@@ -20,11 +20,13 @@ SingleGPUExecutor::SingleGPUExecutor(
     size_t numGpu,
     std::shared_ptr observer,
     c10::Device resultDevice,
-    size_t numProcessThreads)
+    size_t numProcessThreads,
+    bool useHighPriCudaStream)
     : manager_(manager),
       execInfos_(std::move(execInfos)),
       numGpu_(numGpu),
       numProcessThreads_(numProcessThreads),
+      useHighPriCudaStream_(useHighPriCudaStream),
       resultDevice_(resultDevice),
       observer_(observer),
       requests_(kQUEUE_CAPACITY),
@@ -104,8 +106,8 @@ void SingleGPUExecutor::process() {
   c10::InferenceMode inferenceModeGuard;
   std::vector streams;
   for (size_t i = 0; i < numGpu_; ++i) {
-    streams.push_back(at::cuda::getStreamFromPool(
-        false /* isHighPriority */, i /* device */));
+    streams.push_back(
+        at::cuda::getStreamFromPool(useHighPriCudaStream_, i /* device */));
   }
   at::cuda::CUDAMultiStreamGuard streamGuard(streams);