Merge branch 'graphbolt_tests' of https://github.com/drivanov/dgl into graphbolt_tests
drivanov committed Feb 2, 2024
2 parents 3e8647f + 6b99892 commit f376701
Showing 14 changed files with 549 additions and 147 deletions.
6 changes: 4 additions & 2 deletions docs/source/install/index.rst
@@ -5,11 +5,13 @@ System requirements
-------------------
DGL works with the following operating systems:

* Ubuntu 16.04
* Ubuntu 20.04+
* CentOS 8+
* RHEL 8+
* macOS X
* Windows 10

DGL requires Python version 3.6, 3.7, 3.8 or 3.9.
DGL requires Python version 3.7, 3.8, 3.9, 3.10, 3.11.

DGL supports multiple tensor libraries as backends, e.g., PyTorch, MXNet. For requirements on backends and how to select one, see :ref:`backends`.

8 changes: 6 additions & 2 deletions graphbolt/src/cuda/index_select_csc_impl.cu
@@ -14,12 +14,13 @@
#include <numeric>

#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"

namespace graphbolt {
namespace ops {

constexpr int BLOCK_SIZE = 128;
constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS;

// Given the in_degree array and a permutation, returns in_degree of the output
// and the permuted and modified in_degree of the input. The modified in_degree
@@ -130,7 +131,10 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::Tensor output_indices =
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE);
const dim3 grid((edge_count_aligned + BLOCK_SIZE - 1) / BLOCK_SIZE);
const dim3 grid(
(std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE);

// Find the smallest integer type to store the coo_aligned_rows tensor.
const int num_bits = cuda::NumberOfBits(num_nodes);
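The new cap bounds the total number of threads launched for the UVA copy: the grid is sized from min(edge_count_aligned, max_uva_threads) rather than the full edge count, with 1 << 20 as the default cap, so oversized workloads are presumably covered by a grid-stride loop inside the kernel. A minimal sketch of the launch arithmetic in Python, assuming CUDA_MAX_NUM_THREADS is 1024 (an assumption, not verified against the headers):

# Sketch of the capped grid computation above; CUDA_MAX_NUM_THREADS assumed to be 1024.
def capped_grid_size(edge_count_aligned, max_uva_threads=None, block_size=1024):
    # Cap the total thread count, then ceil-divide by the block size.
    total_threads = min(edge_count_aligned, max_uva_threads or (1 << 20))
    return (total_threads + block_size - 1) // block_size

print(capped_grid_size(5_000_000))         # 1024 blocks (capped at 2**20 threads)
print(capped_grid_size(5_000_000, 2**18))  # 256 blocks (tighter cap)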
2 changes: 1 addition & 1 deletion graphbolt/src/cuda/index_select_impl.cu
@@ -131,7 +131,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr,
input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr);
} else {
constexpr int BLOCK_SIZE = 512;
constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS;
dim3 block(BLOCK_SIZE, 1);
while (static_cast<int64_t>(block.x) >= 2 * aligned_feature_size) {
block.x >>= 1;
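With BLOCK_SIZE now taken from CUDA_MAX_NUM_THREADS, the surrounding loop still shrinks the block's x-dimension until it is under twice the aligned feature size, so short feature vectors do not leave most of a block idle. An illustrative Python sketch of that sizing rule, again assuming CUDA_MAX_NUM_THREADS is 1024:

# Sketch of the block-width selection in UVAIndexSelectImpl_ (maximum assumed to be 1024).
def pick_block_x(aligned_feature_size, block_size=1024):
    block_x = block_size
    # Halve the width while it is at least twice the feature size.
    while block_x >= 2 * aligned_feature_size:
        block_x >>= 1
    return block_x

print(pick_block_x(16))    # 16
print(pick_block_x(100))   # 128
print(pick_block_x(1024))  # 1024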
40 changes: 30 additions & 10 deletions python/dgl/distributed/dist_graph.py
@@ -60,18 +60,21 @@ class InitGraphRequest(rpc.Request):
with shared memory.
"""

def __init__(self, graph_name):
def __init__(self, graph_name, use_graphbolt):
self._graph_name = graph_name
self._use_graphbolt = use_graphbolt

def __getstate__(self):
return self._graph_name
return self._graph_name, self._use_graphbolt

def __setstate__(self, state):
self._graph_name = state
self._graph_name, self._use_graphbolt = state

def process_request(self, server_state):
if server_state.graph is None:
server_state.graph = _get_graph_from_shared_mem(self._graph_name)
server_state.graph = _get_graph_from_shared_mem(
self._graph_name, self._use_graphbolt
)
return InitGraphResponse(self._graph_name)


@@ -153,13 +156,15 @@ def _exist_shared_mem_array(graph_name, name):
return exist_shared_mem_array(_get_edata_path(graph_name, name))


def _get_graph_from_shared_mem(graph_name):
def _get_graph_from_shared_mem(graph_name, use_graphbolt):
"""Get the graph from the DistGraph server.
The DistGraph server puts the graph structure of the local partition in the shared memory.
The client can access the graph structure and some metadata on nodes and edges directly
through shared memory to reduce the overhead of data access.
"""
if use_graphbolt:
return gb.load_from_shared_memory(graph_name)
g, ntypes, etypes = heterograph_index.create_heterograph_from_shared_memory(
graph_name
)
@@ -524,6 +529,8 @@ class DistGraph:
part_config : str, optional
The path of partition configuration file generated by
:py:meth:`dgl.distributed.partition.partition_graph`. It's used in the standalone mode.
use_graphbolt : bool, optional
Whether to load GraphBolt partition. Default: False.
Examples
--------
@@ -557,9 +564,15 @@ class DistGraph:
manually setting up servers and trainers. The setup is not fully tested yet.
"""

def __init__(self, graph_name, gpb=None, part_config=None):
def __init__(
self, graph_name, gpb=None, part_config=None, use_graphbolt=False
):
self.graph_name = graph_name
self._use_graphbolt = use_graphbolt
if os.environ.get("DGL_DIST_MODE", "standalone") == "standalone":
assert (
use_graphbolt is False
), "GraphBolt is not supported in standalone mode."
assert (
part_config is not None
), "When running in the standalone model, the partition config file is required"
@@ -600,7 +613,9 @@ def __init__(self, graph_name, gpb=None, part_config=None):
self._init(gpb)
# Tell the backup servers to load the graph structure from shared memory.
for server_id in range(self._client.num_servers):
rpc.send_request(server_id, InitGraphRequest(graph_name))
rpc.send_request(
server_id, InitGraphRequest(graph_name, use_graphbolt)
)
for server_id in range(self._client.num_servers):
rpc.recv_response()
self._client.barrier()
@@ -625,7 +640,9 @@ def _init(self, gpb):
assert (
self._client is not None
), "Distributed module is not initialized. Please call dgl.distributed.initialize."
self._g = _get_graph_from_shared_mem(self.graph_name)
self._g = _get_graph_from_shared_mem(
self.graph_name, self._use_graphbolt
)
self._gpb = get_shared_mem_partition_book(self.graph_name)
if self._gpb is None:
self._gpb = gpb
@@ -682,10 +699,10 @@ def _init_edata_store(self):
self._edata_store[etype] = data

def __getstate__(self):
return self.graph_name, self._gpb
return self.graph_name, self._gpb, self._use_graphbolt

def __setstate__(self, state):
self.graph_name, gpb = state
self.graph_name, gpb, self._use_graphbolt = state
self._init(gpb)

self._init_ndata_store()
@@ -1230,6 +1247,9 @@ def find_edges(self, edges, etype=None):
tensor
The destination node ID array.
"""
assert (
self._use_graphbolt is False
), "find_edges is not supported in GraphBolt."
if etype is None:
assert (
len(self.etypes) == 1
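Taken together, these changes thread a single use_graphbolt flag from the DistGraph client through InitGraphRequest and into the shared-memory loading path on the servers. A hypothetical client-side sketch (graph name and ip_config path are illustrative, not from this commit):

import dgl.distributed as dist

dist.initialize("ip_config.txt")  # assumes servers and trainers are already launched

# Load the local partition as a GraphBolt graph from shared memory; the flag is
# also forwarded to backup servers via InitGraphRequest.
g = dist.DistGraph("mygraph", use_graphbolt=True)

# Standalone mode (DGL_DIST_MODE=standalone) rejects the flag, and find_edges()
# now asserts when GraphBolt is enabled.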
15 changes: 14 additions & 1 deletion python/dgl/distributed/partition.py
@@ -638,6 +638,8 @@ def partition_graph(
num_trainers_per_machine=1,
objtype="cut",
graph_formats=None,
use_graphbolt=False,
**kwargs,
):
"""Partition a graph for distributed training and store the partitions on files.
@@ -811,6 +813,10 @@ partition_graph(
``csc`` and ``csr``. If not specified, save one format only according to what
format is available. If multiple formats are available, selection priority
from high to low is ``coo``, ``csc``, ``csr``.
use_graphbolt : bool, optional
Whether to save partitions in GraphBolt format. Default: False.
kwargs : dict
Other keyword arguments for converting DGL partitions to GraphBolt.
Returns
-------
@@ -1298,7 +1304,8 @@ def get_homogeneous(g, balance_ntypes):
)
)

_dump_part_config(f"{out_path}/{graph_name}.json", part_metadata)
part_config = os.path.join(out_path, graph_name + ".json")
_dump_part_config(part_config, part_metadata)

num_cuts = sim_g.num_edges() - tot_num_inner_edges
if num_parts == 1:
@@ -1309,6 +1316,12 @@
)
)

if use_graphbolt:
dgl_partition_to_graphbolt(
part_config,
**kwargs,
)

if return_mapping:
return orig_nids, orig_eids

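With use_graphbolt=True, partition_graph now writes the partition config as before and then calls dgl_partition_to_graphbolt on it, forwarding any extra keyword arguments. A hypothetical end-to-end sketch (the graph, name, and output path are illustrative):

import dgl
from dgl.distributed import partition_graph

g = dgl.rand_graph(1000, 5000)  # toy homogeneous graph
partition_graph(
    g,
    graph_name="mygraph",
    num_parts=2,
    out_path="data/",
    use_graphbolt=True,  # also convert each DGL partition to GraphBolt format
)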
16 changes: 8 additions & 8 deletions python/dgl/graphbolt/dataloader.py
@@ -1,6 +1,6 @@
"""Graph Bolt DataLoaders"""

from queue import Queue
from collections import deque

import torch
import torch.utils.data
@@ -69,18 +69,18 @@ def __init__(self, datapipe, buffer_size=1):
raise ValueError(
"'buffer_size' is required to be a positive integer."
)
self.buffer = Queue(buffer_size)
self.buffer = deque(maxlen=buffer_size)

def __iter__(self):
for data in self.datapipe:
if not self.buffer.full():
self.buffer.put(data)
if len(self.buffer) < self.buffer.maxlen:
self.buffer.append(data)
else:
return_data = self.buffer.get()
self.buffer.put(data)
return_data = self.buffer.popleft()
self.buffer.append(data)
yield return_data
while not self.buffer.empty():
yield self.buffer.get()
while len(self.buffer) > 0:
yield self.buffer.popleft()


class Awaiter(dp.iter.IterDataPipe):
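The prefetch buffer drops the thread-safe queue.Queue in favor of a plain collections.deque, since the buffer is filled and drained inside a single iterator and needs no locking; deque also exposes len() and maxlen directly. A standalone sketch of the same buffering pattern, with any iterable standing in for the upstream datapipe:

from collections import deque

def buffered(source, buffer_size=1):
    # Hold up to buffer_size items ahead of the consumer.
    buffer = deque(maxlen=buffer_size)
    for data in source:
        if len(buffer) < buffer.maxlen:
            buffer.append(data)       # still warming up the buffer
        else:
            out = buffer.popleft()    # oldest buffered item goes out first
            buffer.append(data)       # keep the newest item
            yield out
    while buffer:                     # drain whatever is left at the end
        yield buffer.popleft()

print(list(buffered(range(5), buffer_size=2)))  # [0, 1, 2, 3, 4]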
(Diff for the remaining 8 changed files is not shown.)
