From 5a2b2aa6af07cbc9d2a7b10573f8f2ff6f5cd0a6 Mon Sep 17 00:00:00 2001
From: Dmytro Dzhulgakov
Date: Mon, 29 Oct 2018 13:54:47 -0700
Subject: [PATCH] Remove calls to CopyFrom that can be sync (#13205)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13205

CopyFrom without a context argument does a synchronous copy on the current
GPU, which is exactly what most call sites need. This diff converts about
60% of CopyFrom call sites to the synchronous form. The most common pattern
is a GPU->CPU copy followed by FinishDeviceComputation; the latter can
simply be deleted.

Reviewed By: Yangqing

Differential Revision: D11236076

fbshipit-source-id: eb790ca494dfc5d5e3a7d850b45d6f73221bb204
---
 aten/src/ATen/core/TensorImpl.h           |  2 ++
 caffe2/image/image_input_op.h             |  8 ++---
 caffe2/operators/dataset_ops.cc           |  4 +--
 caffe2/operators/enforce_finite_op.cu     |  2 +-
 caffe2/operators/lengths_tile_op.cc       |  3 +-
 caffe2/operators/lengths_tile_op.cu       |  3 +-
 caffe2/operators/operator_fallback_gpu.h  | 11 ++-----
 caffe2/operators/roi_align_op_gpu_test.cc |  2 +-
 caffe2/operators/utility_ops.cc           |  2 +-
 caffe2/operators/utility_ops.cu           |  3 +-
 caffe2/operators/utility_ops.h            |  5 ++--
 caffe2/utils/hip/math_blas_hip_test.cc    | 36 ++++++++---------------
 caffe2/utils/math_gpu_test.cc             | 20 +++++--------
 caffe2/video/video_input_op.h             |  1 -
 14 files changed, 37 insertions(+), 65 deletions(-)

diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
index adeb3cdcd8de2..1d461a2cf11bc 100644
--- a/aten/src/ATen/core/TensorImpl.h
+++ b/aten/src/ATen/core/TensorImpl.h
@@ -847,6 +847,8 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
    * preserves the DeviceType of the source tensor (so, e.g., if you allocate
    * a tensor on CPU and then CopyFrom a CUDA tensor, that will do a
    * CUDA-to-CPU transfer).
+   *
+   * If the function is invoked without `context`, the copy is synchronous.
    */
   void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) {
     AT_ASSERT(!is_variable());
diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h
index 7c4d2404448e8..efb3f98ea4363 100644
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@@ -86,7 +86,6 @@ class ImageInputOp final
   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_image_{CPU};
   Tensor prefetched_label_{CPU};
   vector<Tensor> prefetched_additional_outputs_;
@@ -1208,12 +1207,13 @@ bool ImageInputOp<Context>::Prefetch() {
   // If the context is not CPUContext, we will need to do a copy in the
   // prefetch function as well.
  if (!std::is_same<Context, CPUContext>::value) {
-    prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
-    prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
+    // do sync copies
+    prefetched_image_on_device_.CopyFrom(prefetched_image_);
+    prefetched_label_on_device_.CopyFrom(prefetched_label_);
     for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
       prefetched_additional_outputs_on_device_[i].CopyFrom(
-          prefetched_additional_outputs_[i], &cpu_context_);
+          prefetched_additional_outputs_[i]);
     }
   }
diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc
index a69fefd7c7a19..d52d0aca8525f 100644
--- a/caffe2/operators/dataset_ops.cc
+++ b/caffe2/operators/dataset_ops.cc
@@ -961,10 +961,10 @@ class CollectTensorOp final : public Operator<Context> {
     } else if (pos >= tensorVector->size()) {
       // append
       tensorVector->emplace_back(Context::GetDeviceType());
-      tensorVector->back().CopyFrom(tensor, &context_);
+      tensorVector->back().CopyFrom(tensor); // sync copy
     } else {
       // replace
-      tensorVector->at(pos).CopyFrom(tensor, &context_);
+      tensorVector->at(pos).CopyFrom(tensor); // sync copy
     }
   }
diff --git a/caffe2/operators/enforce_finite_op.cu b/caffe2/operators/enforce_finite_op.cu
index 38f1669a40af3..ce5864cbfe313 100644
--- a/caffe2/operators/enforce_finite_op.cu
+++ b/caffe2/operators/enforce_finite_op.cu
@@ -7,7 +7,7 @@ namespace caffe2 {
 template <>
 template <typename T>
 bool EnforceFiniteOp<CUDAContext>::DoRunWithType() {
-  buffer_.CopyFrom(Input(0), &context_);
+  buffer_.CopyFrom(Input(0)); // sync copy
   EnforceOnCPU<T>(buffer_);
   return true;
 }
diff --git a/caffe2/operators/lengths_tile_op.cc b/caffe2/operators/lengths_tile_op.cc
index 049acaf2ff24d..c777576c1ec30 100644
--- a/caffe2/operators/lengths_tile_op.cc
+++ b/caffe2/operators/lengths_tile_op.cc
@@ -15,8 +15,7 @@ bool LengthsTileOp<CPUContext>::RunOnDevice() {
   // Context::CopyFrom and math::Sum need the same context to avoid race
   // conditions
   // why? CPUContext is not used in Sum
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.numel();
   auto* lengths_data = lengths_host_.data<int32_t>();
diff --git a/caffe2/operators/lengths_tile_op.cu b/caffe2/operators/lengths_tile_op.cu
index 1eab06ff74b6f..9a652f2bf1c98 100644
--- a/caffe2/operators/lengths_tile_op.cu
+++ b/caffe2/operators/lengths_tile_op.cu
@@ -28,8 +28,7 @@ bool LengthsTileOp<CUDAContext>::RunOnDevice() {
   CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
   CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0));
-  lengths_host_.CopyFrom(lengths, &context_);
-  context_.FinishDeviceComputation();
+  lengths_host_.CopyFrom(lengths); // sync copy
   auto lengths_size = lengths_host_.size();
   auto* lengths_data = lengths_host_.data<int32_t>();
diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h
index 5b3a38dbfbd13..ad55ee70559de 100644
--- a/caffe2/operators/operator_fallback_gpu.h
+++ b/caffe2/operators/operator_fallback_gpu.h
@@ -62,12 +62,10 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
   }
 
   bool RunOnDevice() override {
-    bool need_sync = false;
     for (int i = 0; i < InputSize(); ++i) {
       if (this->InputIsTensorType(i, CUDA)) {
-        BlobGetMutableTensor(local_input_blobs_[i], CPU)
-            ->CopyFrom(Input(i), &context_);
-        need_sync = true;
+        // use sync copy
+        BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i));
       } else {
         VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
         // Note(jiayq): This removes a const but conceptually
@@ -79,11 +77,6 @@ class GPUFallbackOpEx final : public Operator<CUDAContext> {
       }
     }
-    // Sync to make sure copies are done.
-    if (need_sync) {
-      context_.FinishDeviceComputation();
-    }
-
     if (!base_op_->Run()) {
       LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
                  << ProtoDebugString(this->debug_def());
diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc
index 1ab49f8f826a9..506bd319f1567 100644
--- a/caffe2/operators/roi_align_op_gpu_test.cc
+++ b/caffe2/operators/roi_align_op_gpu_test.cc
@@ -190,7 +190,7 @@ void CreateAndRun(
   EXPECT_NE(nullptr, Y_blob);
   auto& Y = Y_blob->Get<TensorCUDA>();
-  outResult->CopyFrom(Y, &context);
+  outResult->CopyFrom(Y);
 }
 
 } // namespace
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 9fdea3930fdf9..d9d664649a18e 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -813,7 +813,7 @@ bool NanCheckOp<CPUContext>::RunOnDevice() {
   }
 
   if (&X != Y) {
-    Y->CopyFrom(X, &context_);
+    Y->CopyFrom(X);
   }
   return true;
 }
diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu
index 9472f84521b1a..868e8498303e0 100644
--- a/caffe2/operators/utility_ops.cu
+++ b/caffe2/operators/utility_ops.cu
@@ -107,9 +107,8 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
       {
         std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-        cpu_X.CopyFrom(Input(j), &context_);
+        cpu_X.CopyFrom(Input(j)); // sync copy
       }
-      context_.FinishDeviceComputation();
       std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
                 << "]" << std::endl;
       tensorPrinter_.Print<float>(cpu_X);
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index fe09b4238319d..aa55bbf909737 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -132,9 +132,8 @@ class PrintOp final : public Operator<Context> {
     if (this->InputIsTensorType(0, CPU)) {
       tensor = &this->template Input<Tensor>(0, CPU);
     } else {
-      tensor_copy_if_needed.CopyFrom(Input(0), &context_);
-      // Make sure that the copy is finished.
-      context_.FinishDeviceComputation();
+      // sync copy
+      tensor_copy_if_needed.CopyFrom(Input(0));
       tensor = &tensor_copy_if_needed;
     }
     tensor_printer_.Print<T>(*tensor);
diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc
index a5df5900ee23a..675a50200a268 100644
--- a/caffe2/utils/hip/math_blas_hip_test.cc
+++ b/caffe2/utils/hip/math_blas_hip_test.cc
@@ -60,8 +60,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -81,8 +80,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -102,8 +100,7 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -160,8 +157,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -181,8 +177,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -201,8 +196,7 @@ TEST(MathROCBLASTest, GemmNoTransTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   EXPECT_EQ(tensorY_host->size(), 30);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -256,8 +250,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 10) << i;
   }
@@ -274,8 +267,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 15) << i;
   }
@@ -292,8 +284,7 @@ TEST(MathROCBLASTest, GemvNoTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 20) << i;
   }
@@ -346,8 +337,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 6) << i;
   }
@@ -364,8 +354,7 @@ TEST(MathROCBLASTest, GemvTrans) {
       tensorY->mutable_data<float>(),
       &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 9) << i;
   }
@@ -382,8 +371,7 @@ TEST(MathROCBLASTest, GemvTrans) {
      tensorY->mutable_data<float>(),
      &context);
   context.FinishDeviceComputation();
-  tensorY_host->CopyFrom(*tensorY, &context);
-  context.FinishDeviceComputation();
+  tensorY_host->CopyFrom(*tensorY);
   for (int i = 0; i < tensorY_host->size(); ++i) {
     CHECK_EQ(tensorY_host->data<float>()[i], 12) << i;
   }
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index 6f30edabf2ec9..a6d0422435a70 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -72,8 +72,7 @@ void executeGpuBinaryOpTest(
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
 
   for (int i = 0; i < shapey; ++i) {
     EXPECT_EQ(tensory_host->data<float>()[i], correct_output(i));
   }
@@ -126,8 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   // Copy result to CPU so we can inspect it
   auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU);
-  tensory_host->CopyFrom(*tensory, &context);
-  context.FinishDeviceComputation();
+  tensory_host->CopyFrom(*tensory);
 
   for (int k = 0; k < 33; k++) {
     for (int i = 0; i < 25; i++) {
@@ -403,8 +401,7 @@ class ReduceTensorGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -682,8 +679,7 @@ class BroadcastGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
@@ -767,11 +763,10 @@ class MomentsGPUTest : public testing::Test {
       const std::vector<float>& variance_data) {
     Blob* blob_mean_host = ws_.CreateBlob("mean_host");
     auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU);
-    mean_host->CopyFrom(*mean_, cuda_context_.get());
+    mean_host->CopyFrom(*mean_);
 
     Blob* blob_variance_host = ws_.CreateBlob("variance_host");
     auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU);
-    variance_host->CopyFrom(*variance_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    variance_host->CopyFrom(*variance_);
 
     ASSERT_EQ(mean_data.size(), mean_host->size());
     for (std::size_t i = 0; i < mean_data.size(); ++i) {
@@ -891,8 +886,7 @@ class TransposeGPUTest : public testing::Test {
   void VerifyResult(const std::vector<float>& expected_output) {
     Blob* blob_y_host = ws_.CreateBlob("Y_host");
     auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
-    Y_host->CopyFrom(*Y_, cuda_context_.get());
-    cuda_context_->FinishDeviceComputation();
+    Y_host->CopyFrom(*Y_);
     ASSERT_EQ(expected_output.size(), Y_host->size());
     for (std::size_t i = 0; i < expected_output.size(); ++i) {
       EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h
index e58ece491e223..b2dffc2e4124f 100644
--- a/caffe2/video/video_input_op.h
+++ b/caffe2/video/video_input_op.h
@@ -51,7 +51,6 @@ class VideoInputOp final : public PrefetchOperator<Context> {
       std::bernoulli_distribution* mirror_this_clip);
 
   const db::DBReader* reader_;
-  CPUContext cpu_context_;
   Tensor prefetched_clip_rgb_{CPU};
   Tensor prefetched_clip_of_{CPU};
   Tensor prefetched_label_{CPU};
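
Editor's note (not part of the patch): the pattern this commit migrates is easiest to see side by side. Below is a minimal sketch contrasting the old and new call styles; it assumes the caffe2 C++ API as of this commit (caffe2::Tensor, CUDAContext, and the CopyFrom overload shown in the TensorImpl.h hunk above), and the helper name StageOnCpu and its variable names are hypothetical, not taken from the diff.

    #include "caffe2/core/context_gpu.h"
    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    // Hypothetical helper: stage a GPU tensor on the CPU so the host can read it.
    void StageOnCpu(const Tensor& gpu_src, Tensor* cpu_dst, CUDAContext* context) {
      // Old pattern: enqueue an async copy on the op's context, then block on
      // the whole device so the host-side bytes are safe to read.
      cpu_dst->CopyFrom(gpu_src, context);
      context->FinishDeviceComputation();

      // New pattern: with no context argument, CopyFrom performs the copy
      // synchronously on the current GPU, so the explicit
      // FinishDeviceComputation fence above is redundant and can be dropped.
      cpu_dst->CopyFrom(gpu_src);
    }

    } // namespace caffe2

The synchronous form fits every call site touched here: a prefetch thread, a debug printer, or a test assertion is about to read the destination on the host anyway, so there is no useful work to overlap with the copy.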