Skip to content

Commit

Permalink
Parallelize SingleGPUExecutor processing (#1482)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #1482

Replace the single scheduling thread that ran SingleGPUExecutor::process with a thread pool of configurable size (numProcessThreads, defaulting to 1).

Reviewed By: palkop

Differential Revision: D50983588

Privacy Context Container: L1138451

fbshipit-source-id: c024131fccb0210355dd958c950f6e31377167da
  • Loading branch information
shripadt authored and facebook-github-bot committed Nov 4, 2023
1 parent da3cd7d commit 5fce39c
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class SingleGPUExecutor {
size_t numGpu,
std::shared_ptr<ISingleGPUExecutorObserver> observer =
std::make_shared<EmptySingleGPUExecutorObserver>(),
c10::Device resultDevice = c10::kCPU);
c10::Device resultDevice = c10::kCPU,
size_t numProcessThreads = 1u);

// Moveable only
SingleGPUExecutor(SingleGPUExecutor&& executor) noexcept = default;
Expand All @@ -48,12 +49,13 @@ class SingleGPUExecutor {
std::shared_ptr<torch::deploy::InterpreterManager> manager_;
const ExecInfos execInfos_;
const size_t numGpu_;
const size_t numProcessThreads_;
const c10::Device resultDevice_;
std::shared_ptr<ISingleGPUExecutorObserver> observer_;
folly::MPMCQueue<std::shared_ptr<PredictionBatch>> requests_;

std::unique_ptr<folly::CPUThreadPoolExecutor> processExecutor_;
std::unique_ptr<folly::CPUThreadPoolExecutor> completionExecutor_;
std::atomic<size_t> roundRobinExecInfoNextIdx_;
std::thread processThread_;
};
} // namespace torchrec
18 changes: 13 additions & 5 deletions torchrec/inference/src/SingleGPUExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,34 @@ SingleGPUExecutor::SingleGPUExecutor(
ExecInfos execInfos,
size_t numGpu,
std::shared_ptr<ISingleGPUExecutorObserver> observer,
c10::Device resultDevice)
c10::Device resultDevice,
size_t numProcessThreads)
: manager_(manager),
execInfos_(std::move(execInfos)),
numGpu_(numGpu),
numProcessThreads_(numProcessThreads),
resultDevice_(resultDevice),
observer_(observer),
requests_(kQUEUE_CAPACITY),
processExecutor_(
std::make_unique<folly::CPUThreadPoolExecutor>(numProcessThreads)),
completionExecutor_(
std::make_unique<folly::CPUThreadPoolExecutor>(execInfos_.size())),
roundRobinExecInfoNextIdx_(0u),
processThread_([&]() { process(); }) {
roundRobinExecInfoNextIdx_(0u) {
for (size_t i = 0; i < numProcessThreads_; ++i) {
processExecutor_->add([&]() { process(); });
}
for (const auto& exec_info : execInfos_) {
TORCHREC_CHECK(exec_info.interpIdx < manager_->allInstances().size());
}
TORCHREC_CHECK(observer_);
}

SingleGPUExecutor::~SingleGPUExecutor() {
  // Shut down the process thread pool: enqueue exactly one null sentinel per
  // process thread so every thread blocked on requests_ wakes up; presumably
  // process() exits its loop when it dequeues nullptr — TODO confirm.
  for (size_t i = 0; i < numProcessThreads_; ++i) {
    requests_.blockingWrite(nullptr);
  }
  // Join the process pool first; the completion pool is joined afterwards,
  // since completion work appears to be scheduled from process() — confirm
  // against SingleGPUExecutor::process().
  processExecutor_->join();
  completionExecutor_->join();
}

Expand Down

0 comments on commit 5fce39c

Please sign in to comment.