Skip to content

Commit

Permalink
Remove calls to CopyFrom that can be sync (pytorch#13205)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch#13205

CopyFrom without a context argument performs a synchronous copy on the current GPU — exactly what most call sites need.

This diff removes about 60% of CopyFrom usages. The most common pattern is a GPU->CPU copy followed by FinishDeviceComputation — the latter can simply be removed.

Reviewed By: Yangqing

Differential Revision: D11236076

fbshipit-source-id: eb790ca494dfc5d5e3a7d850b45d6f73221bb204
  • Loading branch information
Dmytro Dzhulgakov authored and facebook-github-bot committed Oct 29, 2018
1 parent 8ad69a8 commit 5a2b2aa
Show file tree
Hide file tree
Showing 14 changed files with 37 additions and 65 deletions.
2 changes: 2 additions & 0 deletions aten/src/ATen/core/TensorImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
* preserves the DeviceType of the source tensor (so, e.g., if you allocate
* a tensor on CPU and then CopyFrom a CUDA tensor, that will do a
* CUDA-to-CPU transfer).
*
* If the function is invoked without `context`, the copy will be synchronous.
*/
void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) {
AT_ASSERT(!is_variable());
Expand Down
8 changes: 4 additions & 4 deletions caffe2/image/image_input_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ class ImageInputOp final

unique_ptr<db::DBReader> owned_reader_;
const db::DBReader* reader_;
CPUContext cpu_context_;
Tensor prefetched_image_{CPU};
Tensor prefetched_label_{CPU};
vector<TensorCPU> prefetched_additional_outputs_;
Expand Down Expand Up @@ -1208,12 +1207,13 @@ bool ImageInputOp<Context>::Prefetch() {
// If the context is not CPUContext, we will need to do a copy in the
// prefetch function as well.
if (!std::is_same<Context, CPUContext>::value) {
prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
// do sync copies
prefetched_image_on_device_.CopyFrom(prefetched_image_);
prefetched_label_on_device_.CopyFrom(prefetched_label_);

for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
prefetched_additional_outputs_on_device_[i].CopyFrom(
prefetched_additional_outputs_[i], &cpu_context_);
prefetched_additional_outputs_[i]);
}
}

Expand Down
4 changes: 2 additions & 2 deletions caffe2/operators/dataset_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -961,10 +961,10 @@ class CollectTensorOp final : public Operator<Context> {
} else if (pos >= tensorVector->size()) {
// append
tensorVector->emplace_back(Context::GetDeviceType());
tensorVector->back().CopyFrom(tensor, &context_);
tensorVector->back().CopyFrom(tensor); // sync copy
} else {
// replace
tensorVector->at(pos).CopyFrom(tensor, &context_);
tensorVector->at(pos).CopyFrom(tensor); // sync copy
}
}

Expand Down
2 changes: 1 addition & 1 deletion caffe2/operators/enforce_finite_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ namespace caffe2 {
template <>
template <typename T>
bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
buffer_.CopyFrom(Input(0), &context_);
buffer_.CopyFrom(Input(0)); // sync copy
EnforceOnCPU<T>(buffer_);
return true;
}
Expand Down
3 changes: 1 addition & 2 deletions caffe2/operators/lengths_tile_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ bool LengthsTileOp<CPUContext>::RunOnDevice() {
// Context::CopyFrom and math::Sum need the same context to avoid race
// conditions
// why? CPUContext is not used in Sum
lengths_host_.CopyFrom(lengths, &context_);
context_.FinishDeviceComputation();
lengths_host_.CopyFrom(lengths); // sync copy
auto lengths_size = lengths_host_.numel();
auto* lengths_data = lengths_host_.data<int32_t>();

Expand Down
3 changes: 1 addition & 2 deletions caffe2/operators/lengths_tile_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ bool LengthsTileOp<CUDAContext>::RunOnDevice() {
CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0));

lengths_host_.CopyFrom(lengths, &context_);
context_.FinishDeviceComputation();
lengths_host_.CopyFrom(lengths); // sync copy
auto lengths_size = lengths_host_.size();
auto* lengths_data = lengths_host_.data<int32_t>();

Expand Down
11 changes: 2 additions & 9 deletions caffe2/operators/operator_fallback_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,10 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
}

bool RunOnDevice() override {
bool need_sync = false;
for (int i = 0; i < InputSize(); ++i) {
if (this->InputIsTensorType(i, CUDA)) {
BlobGetMutableTensor(local_input_blobs_[i], CPU)
->CopyFrom(Input(i), &context_);
need_sync = true;
// use sync copy
BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
} else {
VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
// Note(jiayq): This removes a const but conceptually
Expand All @@ -79,11 +77,6 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
}
}

// Sync to make sure copies are done.
if (need_sync) {
context_.FinishDeviceComputation();
}

if (!base_op_->Run()) {
LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
<< ProtoDebugString(this->debug_def());
Expand Down
2 changes: 1 addition & 1 deletion caffe2/operators/roi_align_op_gpu_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ void CreateAndRun(
EXPECT_NE(nullptr, Y_blob);

auto& Y = Y_blob->Get<Tensor>();
outResult->CopyFrom(Y, &context);
outResult->CopyFrom(Y);
}

} // namespace
Expand Down
2 changes: 1 addition & 1 deletion caffe2/operators/utility_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,7 @@ bool NanCheckOp<CPUContext>::RunOnDevice() {
}

if (&X != Y) {
Y->CopyFrom(X, &context_);
Y->CopyFrom(X);
}
return true;
}
Expand Down
3 changes: 1 addition & 2 deletions caffe2/operators/utility_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,8 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
{
std::lock_guard<std::mutex> lock(CUDAContext::mutex());
cpu_X.CopyFrom(Input(j), &context_);
cpu_X.CopyFrom(Input(j)); // sync copy
}
context_.FinishDeviceComputation();
std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
<< "]" << std::endl;
tensorPrinter_.Print<float>(cpu_X);
Expand Down
5 changes: 2 additions & 3 deletions caffe2/operators/utility_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,8 @@ class PrintOp final : public Operator<Context> {
if (this->InputIsTensorType(0, CPU)) {
tensor = &this->template Input<Tensor>(0, CPU);
} else {
tensor_copy_if_needed.CopyFrom(Input(0), &context_);
// Make sure that the copy is finished.
context_.FinishDeviceComputation();
// sync copy
tensor_copy_if_needed.CopyFrom(Input(0));
tensor = &tensor_copy_if_needed;
}
tensor_printer_.Print<T>(*tensor);
Expand Down
36 changes: 12 additions & 24 deletions caffe2/utils/hip/math_blas_hip_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
EXPECT_EQ(tensorY_host->size(), 30);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
Expand All @@ -81,8 +80,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
EXPECT_EQ(tensorY_host->size(), 30);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
Expand All @@ -102,8 +100,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
EXPECT_EQ(tensorY_host->size(), 30);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
Expand Down Expand Up @@ -160,8 +157,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
EXPECT_EQ(tensorY_host->size(), 30);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
Expand All @@ -181,8 +177,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
EXPECT_EQ(tensorY_host->size(), 30);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
Expand All @@ -201,8 +196,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
EXPECT_EQ(tensorY_host->size(), 30);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
Expand Down Expand Up @@ -256,8 +250,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
}
Expand All @@ -274,8 +267,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
}
Expand All @@ -292,8 +284,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
}
Expand Down Expand Up @@ -346,8 +337,7 @@ TEST(MathROCBLASTest, GemvTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
}
Expand All @@ -364,8 +354,7 @@ TEST(MathROCBLASTest, GemvTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
}
Expand All @@ -382,8 +371,7 @@ TEST(MathROCBLASTest, GemvTrans) {
tensorY->mutable_data<float>(),
&context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY, &context);
context.FinishDeviceComputation();
tensorY_host->CopyFrom(*tensorY);
for (int i = 0; i < tensorY_host->size(); ++i) {
CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
}
Expand Down
20 changes: 7 additions & 13 deletions caffe2/utils/math_gpu_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ void executeGpuBinaryOpTest(

// Copy result to CPU so we can inspect it
auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
tensory_host->CopyFrom(*tensory, &context);
context.FinishDeviceComputation();
tensory_host->CopyFrom(*tensory);

for (int i = 0; i < shapey; ++i) {
EXPECT_EQ(tensory_host->data<float>()[i], correct_output(i));
Expand Down Expand Up @@ -126,8 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {

// Copy result to CPU so we can inspect it
auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
tensory_host->CopyFrom(*tensory, &context);
context.FinishDeviceComputation();
tensory_host->CopyFrom(*tensory);

for (int k = 0; k < 33; k++) {
for (int i = 0; i < 25; i++) {
Expand Down Expand Up @@ -403,8 +401,7 @@ class ReduceTensorGPUTest : public testing::Test {
void VerifyResult(const std::vector<float>& expected_output) {
Blob* blob_y_host = ws_.CreateBlob("Y_host");
auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
Y_host->CopyFrom(*Y_, cuda_context_.get());
cuda_context_->FinishDeviceComputation();
Y_host->CopyFrom(*Y_);
ASSERT_EQ(expected_output.size(), Y_host->size());
for (std::size_t i = 0; i < expected_output.size(); ++i) {
EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
Expand Down Expand Up @@ -682,8 +679,7 @@ class BroadcastGPUTest : public testing::Test {
void VerifyResult(const std::vector<float>& expected_output) {
Blob* blob_y_host = ws_.CreateBlob("Y_host");
auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
Y_host->CopyFrom(*Y_, cuda_context_.get());
cuda_context_->FinishDeviceComputation();
Y_host->CopyFrom(*Y_);
ASSERT_EQ(expected_output.size(), Y_host->size());
for (std::size_t i = 0; i < expected_output.size(); ++i) {
EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
Expand Down Expand Up @@ -767,11 +763,10 @@ class MomentsGPUTest : public testing::Test {
const std::vector<float>& variance_data) {
Blob* blob_mean_host = ws_.CreateBlob("mean_host");
auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU);
mean_host->CopyFrom(*mean_, cuda_context_.get());
mean_host->CopyFrom(*mean_);
Blob* blob_variance_host = ws_.CreateBlob("variance_host");
auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU);
variance_host->CopyFrom(*variance_, cuda_context_.get());
cuda_context_->FinishDeviceComputation();
variance_host->CopyFrom(*variance_);

ASSERT_EQ(mean_data.size(), mean_host->size());
for (std::size_t i = 0; i < mean_data.size(); ++i) {
Expand Down Expand Up @@ -891,8 +886,7 @@ class TransposeGPUTest : public testing::Test {
void VerifyResult(const std::vector<float>& expected_output) {
Blob* blob_y_host = ws_.CreateBlob("Y_host");
auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
Y_host->CopyFrom(*Y_, cuda_context_.get());
cuda_context_->FinishDeviceComputation();
Y_host->CopyFrom(*Y_);
ASSERT_EQ(expected_output.size(), Y_host->size());
for (std::size_t i = 0; i < expected_output.size(); ++i) {
EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
Expand Down
1 change: 0 additions & 1 deletion caffe2/video/video_input_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ class VideoInputOp final : public PrefetchOperator<Context> {
std::bernoulli_distribution* mirror_this_clip);

const db::DBReader* reader_;
CPUContext cpu_context_;
Tensor prefetched_clip_rgb_{CPU};
Tensor prefetched_clip_of_{CPU};
Tensor prefetched_label_{CPU};
Expand Down

0 comments on commit 5a2b2aa

Please sign in to comment.