From 5a2b2aa6af07cbc9d2a7b10573f8f2ff6f5cd0a6 Mon Sep 17 00:00:00 2001
From: Dmytro Dzhulgakov
Date: Mon, 29 Oct 2018 13:54:47 -0700
Subject: [PATCH] Remove calls to CopyFrom that can be sync (#13205)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13205

CopyFrom without a context argument does a synchronous copy on the current
GPU, which is exactly what most call sites need. This diff converts about
60% of CopyFrom call sites to the synchronous form. The most common pattern
is a GPU->CPU copy followed by FinishDeviceComputation; the latter can
simply be deleted.

Reviewed By: Yangqing

Differential Revision: D11236076

fbshipit-source-id: eb790ca494dfc5d5e3a7d850b45d6f73221bb204
---
 aten/src/ATen/core/TensorImpl.h           |  2 ++
 caffe2/image/image_input_op.h             |  8 ++---
 caffe2/operators/dataset_ops.cc           |  4 +--
 caffe2/operators/enforce_finite_op.cu     |  2 +-
 caffe2/operators/lengths_tile_op.cc       |  3 +-
 caffe2/operators/lengths_tile_op.cu       |  3 +-
 caffe2/operators/operator_fallback_gpu.h  | 11 ++-----
 caffe2/operators/roi_align_op_gpu_test.cc |  2 +-
 caffe2/operators/utility_ops.cc           |  2 +-
 caffe2/operators/utility_ops.cu           |  3 +-
 caffe2/operators/utility_ops.h            |  5 ++--
 caffe2/utils/hip/math_blas_hip_test.cc    | 36 ++++++++---------------
 caffe2/utils/math_gpu_test.cc             | 20 +++++--------
 caffe2/video/video_input_op.h             |  1 -
 14 files changed, 37 insertions(+), 65 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index adeb3cdcd8de2..1d461a2cf11bc 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -847,6 +847,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
    * preserves the DeviceType of the source tensor (so, e.g., if you allocate
    * a tensor on CPU and then CopyFrom a CUDA tensor, that will do a
    * CUDA-to-CPU transfer).
+   *
+   * If the function is invoked without `context`, the copy is synchronous.
    */
   void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) {
     AT_ASSERT(!is_variable());
diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h
index 7c4d2404448e8..efb3f98ea4363 100644
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@@ -86,7 +86,6 @@ class ImageInputOp final
   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_image_{CPU};
   Tensor prefetched_label_{CPU};
   vector<Tensor> prefetched_additional_outputs_;
@@ -1208,12 +1207,13 @@ bool ImageInputOp<Context>::Prefetch() {
   // If the context is not CPUContext, we will need to do a copy in the
   // prefetch function as well.
  if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
+    // do sync copies
+    prefetched_image_on_device_.CopyFrom(prefetched_image_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_);
     for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
       prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &cpu_context_);
+          prefetched_additional_outputs_[i]);
     }
   }
diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc
index a69fefd7c7a19..d52d0aca8525f 100644
--- a/caffe2/operators/dataset_ops.cc
+++ b/caffe2/operators/dataset_ops.cc
@@ -961,10 +961,10 @@ class CollectTensorOp final : public Operator<Context> {
     } else if (pos >= tensorVector->size()) {
       // append
       tensorVector->emplace_back(Context::GetDeviceType());
-      tensorVector->back().CopyFrom(tensor, &context_);
+      tensorVector->back().CopyFrom(tensor); // sync copy
     } else {
       // replace
-      tensorVector->at(pos).CopyFrom(tensor, &context_);
+      tensorVector->at(pos).CopyFrom(tensor); // sync copy
     }
   }
diff --git a/caffe2/operators/enforce_finite_op.cu b/caffe2/operators/enforce_finite_op.cu
index 38f1669a40af3..ce5864cbfe313 100644
--- a/caffe2/operators/enforce_finite_op.cu
+++ b/caffe2/operators/enforce_finite_op.cu
@@ -7,7 +7,7 @@ namespace caffe2 {
 template <>
 template <typename T>
 bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
-  buffer_.CopyFrom(Input(0), &context_);
+  buffer_.CopyFrom(Input(0)); // sync copy
   EnforceOnCPU<T>(buffer_);
   return true;
 }
diff --git a/caffe2/operators/lengths_tile_op.cc b/caffe2/operators/lengths_tile_op.cc
index 049acaf2ff24d..c777576c1ec30 100644
--- a/caffe2/operators/lengths_tile_op.cc
+++ b/caffe2/operators/lengths_tile_op.cc
@@ -15,8 +15,7 @@ bool LengthsTileOp<CPUContext>::RunOnDevice() {
   // Context::CopyFrom and math::Sum need the same context to avoid race
   // conditions
   // why? CPUContext is not used in Sum
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.numel();
   auto* lengths_data = lengths_host_.data<int32_t>();
diff --git a/caffe2/operators/lengths_tile_op.cu b/caffe2/operators/lengths_tile_op.cu
index 1eab06ff74b6f..9a652f2bf1c98 100644
--- a/caffe2/operators/lengths_tile_op.cu
+++ b/caffe2/operators/lengths_tile_op.cu
@@ -28,8 +28,7 @@ bool LengthsTileOp<CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
   CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0));
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.size();
   auto* lengths_data = lengths_host_.data<int32_t>();
diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h
index 5b3a38dbfbd13..ad55ee70559de 100644
--- a/caffe2/operators/operator_fallback_gpu.h
+++ b/caffe2/operators/operator_fallback_gpu.h
@@ -62,12 +62,10 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
   }
 
   bool RunOnDevice() override {
-    bool need_sync = false;
     for (int i = 0; i < InputSize(); ++i) {
       if (this->InputIsTensorType(i, CUDA)) {
-        BlobGetMutableTensor(local_input_blobs_[i], CPU)
-            ->CopyFrom(Input(i), &context_);
-        need_sync = true;
+        // use sync copy
+        BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
       } else {
         VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
         // Note(jiayq): This removes a const but conceptually
@@ -79,11 +77,6 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
       }
     }
-    // Sync to make sure copies are done.
-    if (need_sync) {
-      context_.FinishDeviceComputation();
-    }
-
     if (!base_op_->Run()) {
       LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
                  << ProtoDebugString(this->debug_def());
diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc
index 1ab49f8f826a9..506bd319f1567 100644
--- a/caffe2/operators/roi_align_op_gpu_test.cc
+++ b/caffe2/operators/roi_align_op_gpu_test.cc
@@ -190,7 +190,7 @@ void CreateAndRun(
   EXPECT_NE(nullptr, Y_blob);
   auto& Y = Y_blob->Get<TensorCUDA>();
-  outResult->CopyFrom(Y, &context);
+  outResult->CopyFrom(Y);
 }
 
 } // namespace
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 9fdea3930fdf9..d9d664649a18e 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -813,7 +813,7 @@ bool NanCheckOp<CPUContext>::RunOnDevice() {
   }
 
   if (&X != Y) {
-    Y->CopyFrom(X, &context_);
+    Y->CopyFrom(X);
   }
   return true;
 }
diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu
index 9472f84521b1a..868e8498303e0 100644
--- a/caffe2/operators/utility_ops.cu
+++ b/caffe2/operators/utility_ops.cu
@@ -107,9 +107,8 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
       {
         std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-        cpu_X.CopyFrom(Input(j), &context_);
+        cpu_X.CopyFrom(Input(j)); // sync copy
       }
-      context_.FinishDeviceComputation();
       std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
                 << "]" << std::endl;
       tensorPrinter_.Print<float>(cpu_X);
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index fe09b4238319d..aa55bbf909737 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -132,9 +132,8 @@ class PrintOp final : public Operator<Context> {
     if (this->InputIsTensorType(0, CPU)) {
       tensor = &this->template Input<Tensor>(0, CPU);
     } else {
-      tensor_copy_if_needed.CopyFrom(Input(0), &context_);
-      // Make sure that the copy is finished.
-      context_.FinishDeviceComputation();
+      // sync copy
+      tensor_copy_if_needed.CopyFrom(Input(0));
       tensor = &tensor_copy_if_needed;
     }
     tensor_printer_.Print<T>(*tensor);
diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc
index a5df5900ee23a..675a50200a268 100644
--- a/caffe2/utils/hip/math_blas_hip_test.cc
+++ b/caffe2/utils/hip/math_blas_hip_test.cc
@@ -60,8 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -81,8 +80,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -102,8 +100,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -160,8 +157,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -181,8 +177,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -201,8 +196,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -256,8 +250,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -274,8 +267,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -292,8 +284,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -346,8 +337,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
   }
@@ -364,8 +354,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
   }
@@ -382,8 +371,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
   }
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index 6f30edabf2ec9..a6d0422435a70 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -72,8 +72,7 @@ void executeGpuBinaryOpTest(
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
 
   for (int i = 0; i < shapey; ++i) {
     EXPECT_EQ(tensory_host->data<float>()[i], correct_output(i));
   }
@@ -126,8 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
 
   for (int k = 0; k < 33; k++) {
     for (int i = 0; i < 25; i++) {
@@ -403,8 +401,7 @@ class ReduceTensorGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -682,8 +679,7 @@ class BroadcastGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -767,11 +763,10 @@ class MomentsGPUTest : public testing::Test {
       const std::vector<float>& variance_data) {
     Blob* blob_mean_host = ws_.CreateBlob("mean_host");
     auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU);
-    mean_host->CopyFrom(*mean_, cuda_context_.get());
+    mean_host->CopyFrom(*mean_);
 
     Blob* blob_variance_host = ws_.CreateBlob("variance_host");
     auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU);
-    variance_host->CopyFrom(*variance_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    variance_host->CopyFrom(*variance_);
 
     ASSERT_EQ(mean_data.size(), mean_host->size());
     for (std::size_t i = 0; i < mean_data.size(); ++i) {
@@ -891,8 +886,7 @@ class TransposeGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h
index e58ece491e223..b2dffc2e4124f 100644
--- a/caffe2/video/video_input_op.h
+++ b/caffe2/video/video_input_op.h
@@ -51,7 +51,6 @@ class VideoInputOp final : public PrefetchOperator<Context> {
       std::bernoulli_distribution* mirror_this_clip);
 
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_clip_rgb_{CPU};
   Tensor prefetched_clip_of_{CPU};
   Tensor prefetched_label_{CPU};
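
Editor's note (not part of the patch): the pattern this commit migrates is easiest to see side by side. Below is a minimal sketch contrasting the old and new call styles; it assumes the caffe2 C++ API as of this commit (caffe2::Tensor, CUDAContext, and the CopyFrom overload shown in the TensorImpl.h hunk above), and the helper name StageOnCpu and its variable names are hypothetical, not taken from the diff.

    #include "caffe2/core/context_gpu.h"
    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    // Hypothetical helper: stage a GPU tensor on the CPU so the host can read it.
    void StageOnCpu(const Tensor& gpu_src, Tensor* cpu_dst, CUDAContext* context) {
      // Old pattern: enqueue an async copy on the op's context, then block on
      // the whole device so the host-side bytes are safe to read.
      cpu_dst->CopyFrom(gpu_src, context);
      context->FinishDeviceComputation();

      // New pattern: with no context argument, CopyFrom performs the copy
      // synchronously on the current GPU, so the explicit
      // FinishDeviceComputation fence above is redundant and can be dropped.
      cpu_dst->CopyFrom(gpu_src);
    }

    } // namespace caffe2

The synchronous form fits every call site touched here: a prefetch thread, a debug printer, or a test assertion is about to read the destination on the host anyway, so there is no useful work to overlap with the copy.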