From 177dc133d685d69a89a775eb9d1ca720094e14c7 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih BALIN <m.f.balin@gmail.com>
Date: Fri, 2 Feb 2024 08:16:52 +0300
Subject: [PATCH 1/2] [GraphBolt][CUDA] IndexSelectCSC kernel launch config
 change. (#7056)

---
 graphbolt/src/cuda/index_select_csc_impl.cu | 8 ++++++--
 graphbolt/src/cuda/index_select_impl.cu     | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/graphbolt/src/cuda/index_select_csc_impl.cu b/graphbolt/src/cuda/index_select_csc_impl.cu
index d1a6a89af18f..ce8af7a9f615 100644
--- a/graphbolt/src/cuda/index_select_csc_impl.cu
+++ b/graphbolt/src/cuda/index_select_csc_impl.cu
@@ -14,12 +14,13 @@
 #include <numeric>
 
 #include "./common.h"
+#include "./max_uva_threads.h"
 #include "./utils.h"
 
 namespace graphbolt {
 namespace ops {
 
-constexpr int BLOCK_SIZE = 128;
+constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS;
 
 // Given the in_degree array and a permutation, returns in_degree of the output
 // and the permuted and modified in_degree of the input. The modified in_degree
@@ -130,7 +131,10 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
   torch::Tensor output_indices =
       torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
   const dim3 block(BLOCK_SIZE);
-  const dim3 grid((edge_count_aligned + BLOCK_SIZE - 1) / BLOCK_SIZE);
+  const dim3 grid(
+      (std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
+       BLOCK_SIZE - 1) /
+      BLOCK_SIZE);
 
   // Find the smallest integer type to store the coo_aligned_rows tensor.
   const int num_bits = cuda::NumberOfBits(num_nodes);
diff --git a/graphbolt/src/cuda/index_select_impl.cu b/graphbolt/src/cuda/index_select_impl.cu
index 389d2430f227..43fd144848b0 100644
--- a/graphbolt/src/cuda/index_select_impl.cu
+++ b/graphbolt/src/cuda/index_select_impl.cu
@@ -131,7 +131,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
         IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr,
         input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr);
   } else {
-    constexpr int BLOCK_SIZE = 512;
+    constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS;
     dim3 block(BLOCK_SIZE, 1);
     while (static_cast<int64_t>(block.x) >= 2 * aligned_feature_size) {
       block.x >>= 1;

From 8568386911075fe9113e4b5e40bfe3b36400831e Mon Sep 17 00:00:00 2001
From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com>
Date: Fri, 2 Feb 2024 16:49:59 +0800
Subject: [PATCH 2/2] [dev] update system requirements (#7072)

---
 docs/source/install/index.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst
index 25249e8e36b9..23391ebce1cd 100644
--- a/docs/source/install/index.rst
+++ b/docs/source/install/index.rst
@@ -5,11 +5,13 @@ System requirements
 -------------------
 DGL works with the following operating systems:
 
-* Ubuntu 16.04
+* Ubuntu 20.04+
+* CentOS 8+
+* RHEL 8+
 * macOS X
 * Windows 10
 
-DGL requires Python version 3.6, 3.7, 3.8 or 3.9.
+DGL requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11.
 
 DGL supports multiple tensor libraries as backends, e.g., PyTorch, MXNet. For requirements on backends and how to select one, see :ref:`backends`.