diff --git a/graphbolt/src/cuda/index_select_csc_impl.cu b/graphbolt/src/cuda/index_select_csc_impl.cu index d1a6a89af18f..ce8af7a9f615 100644 --- a/graphbolt/src/cuda/index_select_csc_impl.cu +++ b/graphbolt/src/cuda/index_select_csc_impl.cu @@ -14,12 +14,13 @@ #include #include "./common.h" +#include "./max_uva_threads.h" #include "./utils.h" namespace graphbolt { namespace ops { -constexpr int BLOCK_SIZE = 128; +constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS; // Given the in_degree array and a permutation, returns in_degree of the output // and the permuted and modified in_degree of the input. The modified in_degree @@ -130,7 +131,10 @@ std::tuple UVAIndexSelectCSCCopyIndices( torch::Tensor output_indices = torch::empty(output_size.value(), options.dtype(indices.scalar_type())); const dim3 block(BLOCK_SIZE); - const dim3 grid((edge_count_aligned + BLOCK_SIZE - 1) / BLOCK_SIZE); + const dim3 grid( + (std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) + + BLOCK_SIZE - 1) / + BLOCK_SIZE); // Find the smallest integer type to store the coo_aligned_rows tensor. const int num_bits = cuda::NumberOfBits(num_nodes); diff --git a/graphbolt/src/cuda/index_select_impl.cu b/graphbolt/src/cuda/index_select_impl.cu index 389d2430f227..43fd144848b0 100644 --- a/graphbolt/src/cuda/index_select_impl.cu +++ b/graphbolt/src/cuda/index_select_impl.cu @@ -131,7 +131,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) { IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr, input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr); } else { - constexpr int BLOCK_SIZE = 512; + constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS; dim3 block(BLOCK_SIZE, 1); while (static_cast(block.x) >= 2 * aligned_feature_size) { block.x >>= 1;