From a7ef4133c9a1650b0eb99dca80f01d98dc3d845a Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin
Date: Fri, 2 Feb 2024 14:36:58 +0300
Subject: [PATCH 1/3] [GraphBolt][CUDA] Add GPUCache to multiGPU example.

---
 examples/multigpu/graphbolt/node_classification.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py
index b9fa73353300..b5ff011bee24 100644
--- a/examples/multigpu/graphbolt/node_classification.py
+++ b/examples/multigpu/graphbolt/node_classification.py
@@ -284,6 +284,10 @@ def run(rank, world_size, args, devices, dataset):
     hidden_size = 256
     out_size = num_classes
 
+    if args.gpu_cache_size > 0:
+        input_feature = dataset.feature._features[("node", None, "feat")]
+        dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature(input_feature)
+
     # Create GraphSAGE model. It should be copied onto a GPU as a replica.
     model = SAGE(in_size, hidden_size, out_size).to(device)
     model = DDP(model)
@@ -381,6 +385,9 @@ def parse_args():
     parser.add_argument(
         "--num-workers", type=int, default=0, help="The number of processes."
     )
+    parser.add_argument(
+        "--gpu-cache-size", type=int, default=0, help="The GPU cache size for input features."
+    )
     parser.add_argument(
         "--mode",
         default="pinned-cuda",

From e54b358e49b697834a0cd1b531682123704b6873 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin
Date: Fri, 2 Feb 2024 12:20:23 +0000
Subject: [PATCH 2/3] fix the gpu cache, add it to the example properly.

---
 examples/multigpu/graphbolt/node_classification.py | 11 ++++++++---
 graphbolt/src/cuda/gpu_cache.cu                    | 11 +++++------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py
index b5ff011bee24..4186ba4f15c9 100644
--- a/examples/multigpu/graphbolt/node_classification.py
+++ b/examples/multigpu/graphbolt/node_classification.py
@@ -285,8 +285,10 @@ def run(rank, world_size, args, devices, dataset):
     out_size = num_classes
 
     if args.gpu_cache_size > 0:
-        input_feature = dataset.feature._features[("node", None, "feat")]
-        dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature(input_feature)
+        dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature(
+            dataset.feature._features[("node", None, "feat")],
+            args.gpu_cache_size,
+        )
 
     # Create GraphSAGE model. It should be copied onto a GPU as a replica.
     model = SAGE(in_size, hidden_size, out_size).to(device)
@@ -386,7 +388,10 @@ def parse_args():
         "--num-workers", type=int, default=0, help="The number of processes."
     )
     parser.add_argument(
-        "--gpu-cache-size", type=int, default=0, help="The GPU cache size for input features."
+ "--gpu-cache-size", + type=int, + default=0, + help="The GPU cache size for input features.", ) parser.add_argument( "--mode", diff --git a/graphbolt/src/cuda/gpu_cache.cu b/graphbolt/src/cuda/gpu_cache.cu index 0a47bbbddc18..7c479fcc0c10 100644 --- a/graphbolt/src/cuda/gpu_cache.cu +++ b/graphbolt/src/cuda/gpu_cache.cu @@ -43,20 +43,19 @@ std::tuple GpuCache::Query( torch::empty(keys.size(0), keys.options().dtype(torch::kLong)); auto missing_keys = torch::empty(keys.size(0), keys.options().dtype(torch::kLong)); - cuda::CopyScalar missing_len; - auto stream = cuda::GetCurrentStream(); + auto allocator = cuda::GetAllocator(); + auto missing_len_device = allocator.AllocateStorage(1); cache_->Query( reinterpret_cast(keys.data_ptr()), keys.size(0), values.data_ptr(), reinterpret_cast(missing_index.data_ptr()), - reinterpret_cast(missing_keys.data_ptr()), missing_len.get(), - stream); + reinterpret_cast(missing_keys.data_ptr()), + missing_len_device.get(), cuda::GetCurrentStream()); values = values.view(torch::kByte) .slice(1, 0, num_bytes_) .view(dtype_) .view(shape_); - // To safely read missing_len, we synchronize - stream.synchronize(); + cuda::CopyScalar missing_len(missing_len_device.get()); missing_index = missing_index.slice(0, 0, static_cast(missing_len)); missing_keys = missing_keys.slice(0, 0, static_cast(missing_len)); return std::make_tuple(values, missing_index, missing_keys); From fcdb1eba5e383bc4e9561a84a50123a5c5d88210 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih Balin Date: Fri, 2 Feb 2024 16:10:14 +0300 Subject: [PATCH 3/3] take back example change, move to another PR. --- examples/multigpu/graphbolt/node_classification.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index 4186ba4f15c9..b9fa73353300 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -284,12 +284,6 @@ def run(rank, world_size, args, devices, dataset): hidden_size = 256 out_size = num_classes - if args.gpu_cache_size > 0: - dataset.feature._features[("node", None, "feat")] = gb.GPUCachedFeature( - dataset.feature._features[("node", None, "feat")], - args.gpu_cache_size, - ) - # Create GraphSAGE model. It should be copied onto a GPU as a replica. model = SAGE(in_size, hidden_size, out_size).to(device) model = DDP(model) @@ -387,12 +381,6 @@ def parse_args(): parser.add_argument( "--num-workers", type=int, default=0, help="The number of processes." ) - parser.add_argument( - "--gpu-cache-size", - type=int, - default=0, - help="The GPU cache size for input features.", - ) parser.add_argument( "--mode", default="pinned-cuda",