Skip to content

Commit

Permalink
Parallelize SingleGPUExecutor processing (#1482)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #1482

Replace the single scheduling thread that ran SingleGPUExecutor::process with a thread pool of configurable size (numProcessThreads, defaulting to 1).

Reviewed By: palkop

Differential Revision: D50983588

Privacy Context Container: L1138451

fbshipit-source-id: c024131fccb0210355dd958c950f6e31377167da
  • Loading branch information
shripadt authored and facebook-github-bot committed Nov 4, 2023
1 parent da3cd7d commit 5fce39c
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class SingleGPUExecutor {
size_t numGpu,
std::shared_ptr<ISingleGPUExecutorObserver> observer =
std::make_shared<EmptySingleGPUExecutorObserver>(),
c10::Device resultDevice = c10::kCPU);
c10::Device resultDevice = c10::kCPU,
size_t numProcessThreads = 1u);

// Moveable only
SingleGPUExecutor(SingleGPUExecutor&& executor) noexcept = default;
Expand All @@ -48,12 +49,13 @@ class SingleGPUExecutor {
std::shared_ptr<torch::deploy::InterpreterManager> manager_;
const ExecInfos execInfos_;
const size_t numGpu_;
const size_t numProcessThreads_;
const c10::Device resultDevice_;
std::shared_ptr<ISingleGPUExecutorObserver> observer_;
folly::MPMCQueue<std::shared_ptr<PredictionBatch>> requests_;

std::unique_ptr<folly::CPUThreadPoolExecutor> processExecutor_;
std::unique_ptr<folly::CPUThreadPoolExecutor> completionExecutor_;
std::atomic<size_t> roundRobinExecInfoNextIdx_;
std::thread processThread_;
};
} // namespace torchrec
18 changes: 13 additions & 5 deletions torchrec/inference/src/SingleGPUExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,34 @@ SingleGPUExecutor::SingleGPUExecutor(
ExecInfos execInfos,
size_t numGpu,
std::shared_ptr<ISingleGPUExecutorObserver> observer,
c10::Device resultDevice)
c10::Device resultDevice,
size_t numProcessThreads)
: manager_(manager),
execInfos_(std::move(execInfos)),
numGpu_(numGpu),
numProcessThreads_(numProcessThreads),
resultDevice_(resultDevice),
observer_(observer),
requests_(kQUEUE_CAPACITY),
processExecutor_(
std::make_unique<folly::CPUThreadPoolExecutor>(numProcessThreads)),
completionExecutor_(
std::make_unique<folly::CPUThreadPoolExecutor>(execInfos_.size())),
roundRobinExecInfoNextIdx_(0u),
processThread_([&]() { process(); }) {
roundRobinExecInfoNextIdx_(0u) {
for (size_t i = 0; i < numProcessThreads_; ++i) {
processExecutor_->add([&]() { process(); });
}
for (const auto& exec_info : execInfos_) {
TORCHREC_CHECK(exec_info.interpIdx < manager_->allInstances().size());
}
TORCHREC_CHECK(observer_);
}

SingleGPUExecutor::~SingleGPUExecutor() {
  // Shut down the process thread pool: enqueue exactly one null sentinel per
  // process thread so every thread blocked on requests_ wakes up; presumably
  // process() exits its loop when it dequeues nullptr — TODO confirm.
  for (size_t i = 0; i < numProcessThreads_; ++i) {
    requests_.blockingWrite(nullptr);
  }
  // Join the process pool first; the completion pool is joined afterwards,
  // since completion work appears to be scheduled from process() — confirm
  // against SingleGPUExecutor::process().
  processExecutor_->join();
  completionExecutor_->join();
}

Expand Down

0 comments on commit 5fce39c

Please sign in to comment.