From a7ef4133c9a1650b0eb99dca80f01d98dc3d845a Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin
Date: Fri, 2 Feb 2024 14:36:58 +0300
Subject: [PATCH 1/3] [GraphBolt][CUDA] Add GPUCache to multiGPU example.

---
 examples/multigpu/graphbolt/node_classification.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py
index b9fa73353300..b5ff011bee24 100644
--- a/examples/multigpu/graphbolt/node_classification.py
+++ b/examples/multigpu/graphbolt/node_classification.py
@@ -284,6 +284,10 @@ def run(rank, world_size, args, devices, dataset):
     hidden_size = 256
     out_size = num_classes
 
+    if args.gpu_cache_size > 0:
+        input_feature = dataset.feature._features[("node", None, "feat")]
+        dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature(input_feature)
+
     # Create GraphSAGE model. It should be copied onto a GPU as a replica.
     model = SAGE(in_size, hidden_size, out_size).to(device)
     model = DDP(model)
@@ -381,6 +385,9 @@ def parse_args():
     parser.add_argument(
         "--num-workers", type=int, default=0, help="The number of processes."
     )
+    parser.add_argument(
+        "--gpu-cache-size", type=int, default=0, help="The GPU cache size for input features."
+    )
     parser.add_argument(
         "--mode",
         default="pinned-cuda",

From e54b358e49b697834a0cd1b531682123704b6873 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin
Date: Fri, 2 Feb 2024 12:20:23 +0000
Subject: [PATCH 2/3] fix the gpu cache, add it to the example properly.

---
 examples/multigpu/graphbolt/node_classification.py | 11 ++++++++---
 graphbolt/src/cuda/gpu_cache.cu                    | 11 +++++------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py
index b5ff011bee24..4186ba4f15c9 100644
--- a/examples/multigpu/graphbolt/node_classification.py
+++ b/examples/multigpu/graphbolt/node_classification.py
@@ -285,8 +285,10 @@ def run(rank, world_size, args, devices, dataset):
     out_size = num_classes
 
     if args.gpu_cache_size > 0:
-        input_feature = dataset.feature._features[("node", None, "feat")]
-        dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature(input_feature)
+        dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature(
+            dataset.feature._features[("node", None, "feat")],
+            args.gpu_cache_size,
+        )
 
     # Create GraphSAGE model. It should be copied onto a GPU as a replica.
     model = SAGE(in_size, hidden_size, out_size).to(device)
@@ -386,7 +388,10 @@ def parse_args():
         "--num-workers", type=int, default=0, help="The number of processes."
     )
     parser.add_argument(
-        "--gpu-cache-size", type=int, default=0, help="The GPU cache size for input features."
+ "--gpu-cache-size", + type=int, + default=0, + help="The GPU cache size for input features.", ) parser.add_argument( "--mode", diff --git a/graphbolt/src/cuda/gpu_cache.cu b/graphbolt/src/cuda/gpu_cache.cu index 0a47bbbddc18..7c479fcc0c10 100644 --- a/graphbolt/src/cuda/gpu_cache.cu +++ b/graphbolt/src/cuda/gpu_cache.cu @@ -43,20 +43,19 @@ std::tuple GpuCache::Query( torch::empty(keys.size(0), keys.options().dtype(torch::kLong)); auto missing_keys = torch::empty(keys.size(0), keys.options().dtype(torch::kLong)); - cuda::CopyScalar missing_len; - auto stream = cuda::GetCurrentStream(); + auto allocator = cuda::GetAllocator(); + auto missing_len_device = allocator.AllocateStorage(1); cache_->Query( reinterpret_cast(keys.data_ptr()), keys.size(0), values.data_ptr(), reinterpret_cast(missing_index.data_ptr()), - reinterpret_cast(missing_keys.data_ptr()), missing_len.get(), - stream); + reinterpret_cast(missing_keys.data_ptr()), + missing_len_device.get(), cuda::GetCurrentStream()); values = values.view(torch::kByte) .slice(1, 0, num_bytes_) .view(dtype_) .view(shape_); - // To safely read missing_len, we synchronize - stream.synchronize(); + cuda::CopyScalar missing_len(missing_len_device.get()); missing_index = missing_index.slice(0, 0, static_cast(missing_len)); missing_keys = missing_keys.slice(0, 0, static_cast(missing_len)); return std::make_tuple(values, missing_index, missing_keys); From fcdb1eba5e383bc4e9561a84a50123a5c5d88210 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih Balin Date: Fri, 2 Feb 2024 16:10:14 +0300 Subject: [PATCH 3/3] take back example change, move to another PR. --- examples/multigpu/graphbolt/node_classification.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index 4186ba4f15c9..b9fa73353300 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -284,12 +284,6 @@ def run(rank, world_size, args, devices, dataset): hidden_size = 256 out_size = num_classes - if args.gpu_cache_size > 0: - dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature( - dataset.feature._features[("node", None, "feat")], - args.gpu_cache_size, - ) - # Create GraphSAGE model. It should be copied onto a GPU as a replica. model = SAGE(in_size, hidden_size, out_size).to(device) model = DDP(model) @@ -387,12 +381,6 @@ def parse_args(): parser.add_argument( "--num-workers", type=int, default=0, help="The number of processes." ) - parser.add_argument( - "--gpu-cache-size", - type=int, - default=0, - help="The GPU cache size for input features.", - ) parser.add_argument( "--mode", default="pinned-cuda",