remove unnecessary dim3 usages
upsj committed Dec 9, 2021
1 parent 738ecc7 commit 9a33a05
Showing 49 changed files with 519 additions and 569 deletions.
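The whole commit applies one pattern: CUDA's execution configuration `<<<grid, block>>>` accepts plain integral values and implicitly converts them to `dim3` with the y and z extents set to 1, so writing `dim3{n, 1, 1}` for a one-dimensional launch adds nothing. A minimal standalone sketch of the two equivalent spellings (toy kernel and sizes, not code from this repository):

```cuda
// Toy kernel, used only to demonstrate the launch syntax.
__global__ void fill(int* data, int size, int value)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        data[i] = value;
    }
}

int main()
{
    constexpr int size = 1000;
    constexpr int block_size = 256;
    int* data{};
    cudaMalloc(&data, size * sizeof(int));

    // 1D launch spelled out with dim3 ...
    const dim3 grid_dim{(size + block_size - 1) / block_size, 1, 1};
    fill<<<grid_dim, dim3{block_size, 1, 1}>>>(data, size, 1);

    // ... and the equivalent launch with plain integers, which the
    // <<<...>>> syntax implicitly converts to dim3 with y = z = 1.
    const auto grid = (size + block_size - 1) / block_size;
    fill<<<grid, block_size>>>(data, size, 2);

    cudaDeviceSynchronize();
    cudaFree(data);
    return 0;
}
```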
37 changes: 17 additions & 20 deletions cuda/factorization/factorization_kernels.cu
@@ -87,10 +87,9 @@ void add_diagonal_elements(std::shared_ptr<const CudaExecutor> exec,
auto cuda_old_row_ptrs = as_cuda_type(mtx->get_row_ptrs());
auto cuda_row_ptrs_add = as_cuda_type(row_ptrs_addition.get_data());

- const dim3 block_dim{default_block_size, 1, 1};
- const dim3 grid_dim{
- static_cast<uint32>(ceildiv(num_rows, block_dim.x / subwarp_size)), 1,
- 1};
+ const auto block_dim = default_block_size;
+ const auto grid_dim =
+ static_cast<uint32>(ceildiv(num_rows, block_dim / subwarp_size));
if (num_rows > 0) {
if (is_sorted) {
kernel::find_missing_diagonal_elements<true, subwarp_size>
@@ -131,8 +130,8 @@ void add_diagonal_elements(std::shared_ptr<const CudaExecutor> exec,
cuda_old_row_ptrs, cuda_new_values,
cuda_new_col_idxs, cuda_row_ptrs_add);

- const dim3 grid_dim_row_ptrs_update{
- static_cast<uint32>(ceildiv(num_rows, block_dim.x)), 1, 1};
+ const auto grid_dim_row_ptrs_update =
+ static_cast<uint32>(ceildiv(num_rows, block_dim));
kernel::update_row_ptrs<<<grid_dim_row_ptrs_update, block_dim>>>(
num_rows + 1, cuda_old_row_ptrs, cuda_row_ptrs_add);

@@ -153,10 +152,10 @@ void initialize_row_ptrs_l_u(
{
const size_type num_rows{system_matrix->get_size()[0]};

- const dim3 block_size{default_block_size, 1, 1};
+ const auto block_size = default_block_size;
const uint32 number_blocks =
- ceildiv(num_rows, static_cast<size_type>(block_size.x));
- const dim3 grid_dim{number_blocks, 1, 1};
+ ceildiv(num_rows, static_cast<size_type>(block_size));
+ const auto grid_dim = number_blocks;

if (num_rows > 0) {
kernel::count_nnz_per_l_u_row<<<grid_dim, block_size, 0, 0>>>(
@@ -181,10 +180,9 @@ void initialize_l_u(std::shared_ptr<const CudaExecutor> exec,
matrix::Csr<ValueType, IndexType>* csr_u)
{
const size_type num_rows{system_matrix->get_size()[0]};
- const dim3 block_size{default_block_size, 1, 1};
- const dim3 grid_dim{static_cast<uint32>(ceildiv(
- num_rows, static_cast<size_type>(block_size.x))),
- 1, 1};
+ const auto block_size = default_block_size;
+ const auto grid_dim = static_cast<uint32>(
+ ceildiv(num_rows, static_cast<size_type>(block_size)));

if (num_rows > 0) {
kernel::initialize_l_u<<<grid_dim, block_size, 0, 0>>>(
Expand Down Expand Up @@ -212,10 +210,10 @@ void initialize_row_ptrs_l(
{
const size_type num_rows{system_matrix->get_size()[0]};

- const dim3 block_size{default_block_size, 1, 1};
+ const auto block_size = default_block_size;
const uint32 number_blocks =
- ceildiv(num_rows, static_cast<size_type>(block_size.x));
- const dim3 grid_dim{number_blocks, 1, 1};
+ ceildiv(num_rows, static_cast<size_type>(block_size));
+ const auto grid_dim = number_blocks;

if (num_rows > 0) {
kernel::count_nnz_per_l_row<<<grid_dim, block_size, 0, 0>>>(
@@ -238,10 +236,9 @@ void initialize_l(std::shared_ptr<const CudaExecutor> exec,
matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
{
const size_type num_rows{system_matrix->get_size()[0]};
- const dim3 block_size{default_block_size, 1, 1};
- const dim3 grid_dim{static_cast<uint32>(ceildiv(
- num_rows, static_cast<size_type>(block_size.x))),
- 1, 1};
+ const auto block_size = default_block_size;
+ const auto grid_dim = static_cast<uint32>(
+ ceildiv(num_rows, static_cast<size_type>(block_size)));

if (num_rows > 0) {
kernel::initialize_l<<<grid_dim, block_size, 0, 0>>>(
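The factorization kernels above assign one subwarp of `subwarp_size` threads to each row, which is why the grid is computed as `ceildiv(num_rows, block_dim / subwarp_size)`: a block of `block_dim` threads covers `block_dim / subwarp_size` rows. A rough sketch of that mapping, assuming a flat thread index (stub kernel and names, not the Ginkgo implementation):

```cuda
template <int subwarp_size>
__global__ void row_kernel(const int* row_ptrs, int num_rows)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = tid / subwarp_size;   // one subwarp per row
    const int lane = tid % subwarp_size;  // lane within the subwarp
    if (row < num_rows) {
        // each lane would handle every subwarp_size-th entry of its row
        (void)row_ptrs;
        (void)lane;
    }
}

void launch_row_kernel(const int* row_ptrs, int num_rows)
{
    constexpr int subwarp_size = 32;
    constexpr int block_dim = 512;
    constexpr int rows_per_block = block_dim / subwarp_size;
    // Each block covers rows_per_block rows, hence the division in the
    // grid-size computation.
    const auto grid_dim = (num_rows + rows_per_block - 1) / rows_per_block;
    if (num_rows > 0) {
        row_kernel<subwarp_size><<<grid_dim, block_dim>>>(row_ptrs, num_rows);
    }
}
```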
10 changes: 4 additions & 6 deletions cuda/factorization/par_ilu_kernels.cu
@@ -67,13 +67,11 @@ void compute_l_u_factors(std::shared_ptr<const CudaExecutor> exec,
{
iterations = (iterations == 0) ? 10 : iterations;
const auto num_elements = system_matrix->get_num_stored_elements();
- const dim3 block_size{default_block_size, 1, 1};
- const dim3 grid_dim{
- static_cast<uint32>(
- ceildiv(num_elements, static_cast<size_type>(block_size.x))),
- 1, 1};
+ const auto block_size = default_block_size;
+ const auto grid_dim = static_cast<uint32>(
+ ceildiv(num_elements, static_cast<size_type>(block_size)));
for (size_type i = 0; i < iterations; ++i) {
- if (grid_dim.x > 0) {
+ if (grid_dim > 0) {
kernel::compute_l_u_factors<<<grid_dim, block_size, 0, 0>>>(
num_elements, system_matrix->get_const_row_idxs(),
system_matrix->get_const_col_idxs(),
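Besides removing the braces, the scalar launch size also simplifies the empty-launch guard, as in the `grid_dim.x > 0` to `grid_dim > 0` change above. A small sketch of that guard pattern with placeholder names (`sweep` and `run_sweeps` are not part of this repository):

```cuda
__global__ void sweep(double* values, unsigned num_elements)
{
    const unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_elements) {
        values[i] *= 0.5;  // stand-in for the actual factor update
    }
}

void run_sweeps(double* values, unsigned num_elements, int iterations)
{
    constexpr unsigned block_size = 512;
    const auto grid_dim = (num_elements + block_size - 1) / block_size;
    for (int i = 0; i < iterations; ++i) {
        // With a plain integer, the guard is a simple comparison; the dim3
        // version had to inspect grid_dim.x instead.
        if (grid_dim > 0) {
            sweep<<<grid_dim, block_size>>>(values, num_elements);
        }
    }
}
```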
14 changes: 7 additions & 7 deletions cuda/matrix/csr_kernels.cu
@@ -116,8 +116,8 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
const IndexType total = a->get_size()[0] + a->get_num_stored_elements();
const IndexType grid_num =
ceildiv(total, spmv_block_size * items_per_thread);
- const dim3 grid(grid_num);
- const dim3 block(spmv_block_size);
+ const auto grid = grid_num;
+ const auto block = spmv_block_size;
Array<IndexType> row_out(exec, grid_num);
Array<ValueType> val_out(exec, grid_num);

@@ -227,7 +227,7 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
int64(nwarps / warps_in_block));
const dim3 grid(gridx, b->get_size()[1]);
- const dim3 block(spmv_block_size);
+ const auto block = spmv_block_size;

if (alpha == nullptr && beta == nullptr) {
if (grid.x * grid.y > 0) {
@@ -953,9 +953,9 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
matrix::Csr<ValueType, IndexType>* trans)
{
if (cusparse::is_supported<ValueType, IndexType>::value) {
- const dim3 block_size(default_block_size, 1, 1);
- const dim3 grid_size(
- ceildiv(trans->get_num_stored_elements(), block_size.x), 1, 1);
+ const auto block_size = default_block_size;
+ const auto grid_size =
+ ceildiv(trans->get_num_stored_elements(), block_size);

#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
@@ -991,7 +991,7 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
idxBase, alg, buffer);
#endif
- if (grid_size.x > 0) {
+ if (grid_size > 0) {
conjugate_kernel<<<grid_size, block_size, 0, 0>>>(
trans->get_num_stored_elements(),
as_cuda_type(trans->get_values()));
6 changes: 3 additions & 3 deletions cuda/matrix/fbcsr_kernels.cu
@@ -268,9 +268,9 @@ void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
const size_type nbnz = mat->get_num_stored_blocks();
const size_type numthreads = nbnz * subwarp_size;
const size_type numblocks = ceildiv(numthreads, default_block_size);
- const dim3 block_size{static_cast<unsigned>(default_block_size), 1, 1};
- const dim3 grid_dim{static_cast<unsigned>(numblocks), 1, 1};
- if (grid_dim.x > 0) {
+ const auto block_size = static_cast<unsigned>(default_block_size);
+ const auto grid_dim = static_cast<unsigned>(numblocks);
+ if (grid_dim > 0) {
kernel::transpose_blocks<mat_blk_sz, subwarp_size>
<<<grid_dim, block_size, 0, 0>>>(nbnz, mat->get_values());
}
4 changes: 2 additions & 2 deletions cuda/matrix/sellp_kernels.cu
@@ -70,7 +70,7 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
const matrix::Sellp<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
{
- const dim3 blockSize(default_block_size);
+ const auto blockSize = default_block_size;
const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size),
b->get_size()[1]);

@@ -94,7 +94,7 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType>* beta,
matrix::Dense<ValueType>* c)
{
- const dim3 blockSize(default_block_size);
+ const auto blockSize = default_block_size;
const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size),
b->get_size()[1]);

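Note that `gridSize` stays a `dim3` in both SELL-P kernels: that grid really is two-dimensional, with the x dimension covering the rows of `a` and the y dimension covering the columns of `b`. A sketch of such a genuinely 2D launch (simplified dense update, not the SELL-P kernel itself):

```cuda
__global__ void scale_columns(double* values, int num_rows, int num_cols)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y;  // one grid slice of blocks per column
    if (row < num_rows && col < num_cols) {
        values[row * num_cols + col] *= 2.0;
    }
}

void launch_scale(double* values, int num_rows, int num_cols)
{
    constexpr int block_size = 256;
    // The y extent carries real information here, so dim3 is still needed.
    const dim3 grid((num_rows + block_size - 1) / block_size, num_cols);
    if (num_rows > 0 && num_cols > 0) {
        scale_columns<<<grid, block_size>>>(values, num_rows, num_cols);
    }
}
```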
24 changes: 12 additions & 12 deletions cuda/multigrid/amgx_pgm_kernels.cu
@@ -80,8 +80,8 @@ void match_edge(std::shared_ptr<const CudaExecutor> exec,
Array<IndexType>& agg)
{
const auto num = agg.get_num_elems();
- const dim3 grid(ceildiv(num, default_block_size));
- if (grid.x > 0) {
+ const auto grid = ceildiv(num, default_block_size);
+ if (grid > 0) {
kernel::match_edge_kernel<<<grid, default_block_size>>>(
num, strongest_neighbor.get_const_data(), agg.get_data());
}
@@ -95,8 +95,8 @@ void count_unagg(std::shared_ptr<const CudaExecutor> exec,
const Array<IndexType>& agg, IndexType* num_unagg)
{
Array<IndexType> active_agg(exec, agg.get_num_elems());
- const dim3 grid(ceildiv(active_agg.get_num_elems(), default_block_size));
- if (grid.x > 0) {
+ const auto grid = ceildiv(active_agg.get_num_elems(), default_block_size);
+ if (grid > 0) {
kernel::activate_kernel<<<grid, default_block_size>>>(
active_agg.get_num_elems(), agg.get_const_data(),
active_agg.get_data());
@@ -114,13 +114,13 @@ void renumber(std::shared_ptr<const CudaExecutor> exec, Array<IndexType>& agg,
{
const auto num = agg.get_num_elems();
Array<IndexType> agg_map(exec, num + 1);
- const dim3 grid(ceildiv(num, default_block_size));
- if (grid.x > 0) {
+ const auto grid = ceildiv(num, default_block_size);
+ if (grid > 0) {
kernel::fill_agg_kernel<<<grid, default_block_size>>>(
num, agg.get_const_data(), agg_map.get_data());
}
components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems());
- if (grid.x > 0) {
+ if (grid > 0) {
kernel::renumber_kernel<<<grid, default_block_size>>>(
num, agg_map.get_const_data(), agg.get_data());
}
@@ -138,8 +138,8 @@ void find_strongest_neighbor(
Array<IndexType>& strongest_neighbor)
{
const auto num = agg.get_num_elems();
- const dim3 grid(ceildiv(num, default_block_size));
- if (grid.x > 0) {
+ const auto grid = ceildiv(num, default_block_size);
+ if (grid > 0) {
kernel::find_strongest_neighbor_kernel<<<grid, default_block_size>>>(
num, weight_mtx->get_const_row_ptrs(),
weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(),
@@ -159,10 +159,10 @@ void assign_to_exist_agg(std::shared_ptr<const CudaExecutor> exec,
Array<IndexType>& intermediate_agg)
{
const auto num = agg.get_num_elems();
- const dim3 grid(ceildiv(num, default_block_size));
+ const auto grid = ceildiv(num, default_block_size);
if (intermediate_agg.get_num_elems() > 0) {
// determinstic kernel
- if (grid.x > 0) {
+ if (grid > 0) {
kernel::assign_to_exist_agg_kernel<<<grid, default_block_size>>>(
num, weight_mtx->get_const_row_ptrs(),
weight_mtx->get_const_col_idxs(),
@@ -173,7 +173,7 @@ void assign_to_exist_agg(std::shared_ptr<const CudaExecutor> exec,
agg = intermediate_agg;
} else {
// undeterminstic kernel
- if (grid.x > 0) {
+ if (grid > 0) {
kernel::assign_to_exist_agg_kernel<<<grid, default_block_size>>>(
num, weight_mtx->get_const_row_ptrs(),
weight_mtx->get_const_col_idxs(),
30 changes: 15 additions & 15 deletions cuda/preconditioner/isai_kernels.cu
@@ -80,9 +80,9 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
{
const auto num_rows = input->get_size()[0];

- const dim3 block(default_block_size, 1, 1);
- const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1);
- if (grid.x > 0) {
+ const auto block = default_block_size;
+ const auto grid = ceildiv(num_rows, block / subwarp_size);
+ if (grid > 0) {
if (lower) {
kernel::generate_l_inverse<subwarp_size, subwarps_per_block>
<<<grid, block>>>(
@@ -120,9 +120,9 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
{
const auto num_rows = input->get_size()[0];

- const dim3 block(default_block_size, 1, 1);
- const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1);
- if (grid.x > 0) {
+ const auto block = default_block_size;
+ const auto grid = ceildiv(num_rows, block / subwarp_size);
+ if (grid > 0) {
kernel::generate_general_inverse<subwarp_size, subwarps_per_block>
<<<grid, block>>>(static_cast<IndexType>(num_rows),
input->get_const_row_ptrs(),
@@ -152,9 +152,9 @@ void generate_excess_system(std::shared_ptr<const DefaultExecutor> exec,
{
const auto num_rows = input->get_size()[0];

- const dim3 block(default_block_size, 1, 1);
- const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1);
- if (grid.x > 0) {
+ const auto block = default_block_size;
+ const auto grid = ceildiv(e_end - e_start, block / subwarp_size);
+ if (grid > 0) {
kernel::generate_excess_system<subwarp_size><<<grid, block>>>(
static_cast<IndexType>(num_rows), input->get_const_row_ptrs(),
input->get_const_col_idxs(),
@@ -177,9 +177,9 @@ void scale_excess_solution(std::shared_ptr<const DefaultExecutor>,
matrix::Dense<ValueType>* excess_solution,
size_type e_start, size_type e_end)
{
- const dim3 block(default_block_size, 1, 1);
- const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1);
- if (grid.x > 0) {
+ const auto block = default_block_size;
+ const auto grid = ceildiv(e_end - e_start, block / subwarp_size);
+ if (grid > 0) {
kernel::scale_excess_solution<subwarp_size><<<grid, block>>>(
excess_block_ptrs, as_cuda_type(excess_solution->get_values()),
e_start, e_end);
@@ -199,9 +199,9 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
{
const auto num_rows = inverse->get_size()[0];

- const dim3 block(default_block_size, 1, 1);
- const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1);
- if (grid.x > 0) {
+ const auto block = default_block_size;
+ const auto grid = ceildiv(e_end - e_start, block / subwarp_size);
+ if (grid > 0) {
kernel::copy_excess_solution<subwarp_size><<<grid, block>>>(
static_cast<IndexType>(num_rows), inverse->get_const_row_ptrs(),
excess_rhs_ptrs, as_cuda_type(excess_solution->get_const_values()),
6 changes: 3 additions & 3 deletions cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu
@@ -85,11 +85,11 @@ void advanced_apply(
{
constexpr int subwarp_size = get_larger_power(max_block_size);
constexpr int blocks_per_warp = config::warp_size / subwarp_size;
- const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
- 1, 1);
+ const auto grid_size =
+ ceildiv(num_blocks, warps_per_block * blocks_per_warp);
const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);

- if (grid_size.x > 0) {
+ if (grid_size > 0) {
if (block_precisions) {
kernel::advanced_adaptive_apply<max_block_size, subwarp_size,
warps_per_block>
6 changes: 3 additions & 3 deletions cuda/preconditioner/jacobi_generate_instantiate.inc.cu
@@ -88,11 +88,11 @@ void generate(syn::value_list<int, max_block_size>,
{
constexpr int subwarp_size = get_larger_power(max_block_size);
constexpr int blocks_per_warp = config::warp_size / subwarp_size;
- const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
- 1, 1);
+ const auto grid_size =
+ ceildiv(num_blocks, warps_per_block * blocks_per_warp);
const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);

- if (grid_size.x > 0) {
+ if (grid_size > 0) {
if (block_precisions) {
kernel::adaptive_generate<max_block_size, subwarp_size,
warps_per_block>
14 changes: 7 additions & 7 deletions cuda/preconditioner/jacobi_kernels.cu
@@ -79,11 +79,11 @@ size_type find_natural_blocks(std::shared_ptr<const DefaultExecutor> exec,

Array<bool> matching_next_row(exec, mtx->get_size()[0] - 1);

- const dim3 block_size(config::warp_size, 1, 1);
- const dim3 grid_size(
- ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1);
+ const auto block_size = config::warp_size;
+ const auto grid_size =
+ ceildiv(mtx->get_size()[0] * config::warp_size, block_size);

- if (grid_size.x > 0) {
+ if (grid_size > 0) {
compare_adjacent_rows<<<grid_size, block_size, 0, 0>>>(
mtx->get_size()[0], max_block_size, mtx->get_const_row_ptrs(),
mtx->get_const_col_idxs(), matching_next_row.get_data());
@@ -161,11 +161,11 @@ void transpose_jacobi(
{
constexpr int subwarp_size = get_larger_power(max_block_size);
constexpr int blocks_per_warp = config::warp_size / subwarp_size;
- const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
- 1, 1);
+ const auto grid_size =
+ ceildiv(num_blocks, warps_per_block * blocks_per_warp);
const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);

- if (grid_size.x > 0) {
+ if (grid_size > 0) {
if (block_precisions) {
adaptive_transpose_jacobi<conjugate, max_block_size, subwarp_size,
warps_per_block>
6 changes: 3 additions & 3 deletions cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu
@@ -84,11 +84,11 @@ void apply(syn::value_list<int, max_block_size>, size_type num_blocks,
{
constexpr int subwarp_size = get_larger_power(max_block_size);
constexpr int blocks_per_warp = config::warp_size / subwarp_size;
- const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
- 1, 1);
+ const auto grid_size =
+ ceildiv(num_blocks, warps_per_block * blocks_per_warp);
const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);

- if (grid_size.x > 0) {
+ if (grid_size > 0) {
if (block_precisions) {
kernel::adaptive_apply<max_block_size, subwarp_size,
warps_per_block>
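In the three Jacobi instantiation files only the grid collapses to a scalar; `block_size` keeps its three-dimensional `dim3(subwarp_size, blocks_per_warp, warps_per_block)` shape because every axis carries meaning. A sketch of that combination with assumed constants (warp size 32, placeholder kernel):

```cuda
constexpr int subwarp_size = 8;
constexpr int blocks_per_warp = 32 / subwarp_size;  // assumed warp size of 32
constexpr int warps_per_block = 4;

__global__ void block_jacobi_stub(int num_blocks)
{
    // threadIdx.x/y/z would encode subwarp lane, subwarp id and warp id.
    (void)num_blocks;
}

void launch_block_jacobi(int num_blocks)
{
    constexpr int blocks_per_cuda_block = warps_per_block * blocks_per_warp;
    // The grid is one-dimensional, so a plain integer is enough ...
    const auto grid_size =
        (num_blocks + blocks_per_cuda_block - 1) / blocks_per_cuda_block;
    // ... while the thread block keeps its meaningful 3D shape.
    const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);
    if (grid_size > 0) {
        block_jacobi_stub<<<grid_size, block_size>>>(num_blocks);
    }
}
```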
[diff for the remaining changed files not shown]
