Skip to content

Commit

Permalink
clean up count_nnz kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Nov 22, 2021
1 parent 811f2c2 commit a100f16
Show file tree
Hide file tree
Showing 26 changed files with 108 additions and 363 deletions.
26 changes: 22 additions & 4 deletions common/unified/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,27 @@ void inv_scale(std::shared_ptr<const DefaultExecutor> exec,
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL);


template <typename ValueType, typename IndexType>
void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
size_type* result)
{
run_kernel(
exec,
[] GKO_KERNEL(auto row, auto row_ptrs, auto nnz) {
nnz[row] = row_ptrs[row + 1] - row_ptrs[row];
},
source->get_size()[0], source->get_const_row_ptrs(), result);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL);


template <typename ValueType, typename IndexType>
void calculate_max_nnz_per_row(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
size_type* result)
size_type& result)
{
Array<size_type> device_result{exec, 1};
run_kernel_reduction(
Expand All @@ -165,9 +182,10 @@ void calculate_max_nnz_per_row(std::shared_ptr<const DefaultExecutor> exec,
return static_cast<size_type>(row_ptrs[row + 1] - row_ptrs[row]);
},
[] GKO_KERNEL(auto a, auto b) { return a + b; },
[] GKO_KERNEL(auto a) { return a; }, size_type{}, result,
source->get_size()[0], source->get_const_row_ptrs());
*result = exec->copy_val_to_host(device_result.get_data());
[] GKO_KERNEL(auto a) { return a; }, size_type{},
device_result.get_data(), source->get_size()[0],
source->get_const_row_ptrs());
result = exec->copy_val_to_host(device_result.get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down
19 changes: 19 additions & 0 deletions common/unified/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,25 @@ void compute_norm1(std::shared_ptr<const DefaultExecutor> exec,
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);


template <typename ValueType>
void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* mtx,
size_type* result)
{
run_kernel_row_reduction(
exec,
[] GKO_KERNEL(auto i, auto j, auto mtx) {
return mtx(i, j) != zero(mtx(i, j)) ? 1 : 0;
},
[] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; },
[] GKO_KERNEL(auto a) { return a; }, size_type{}, result, 1,
mtx->get_size(), mtx);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);


template <typename ValueType, typename IndexType>
void symm_permute(std::shared_ptr<const DefaultExecutor> exec,
const Array<IndexType>* permutation_indices,
Expand Down
4 changes: 2 additions & 2 deletions core/matrix/csr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ void Csr<ValueType, IndexType>::convert_to(
{
auto exec = this->get_executor();
Array<size_type> row_nnz(exec, this->get_size()[0]);
exec->run(csr::make_count_nonzeros_per_row(this, &row_nnz));
exec->run(csr::make_count_nonzeros_per_row(this, row_nnz.get_data()));
size_type ell_lim = zero<size_type>();
size_type coo_lim = zero<size_type>();
result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_lim);
Expand Down Expand Up @@ -312,7 +312,7 @@ void Csr<ValueType, IndexType>::convert_to(
{
auto exec = this->get_executor();
size_type max_nnz_per_row;
exec->run(csr::make_calculate_max_nnz_per_row(this, &max_nnz_per_row));
exec->run(csr::make_calculate_max_nnz_per_row(this, max_nnz_per_row));
auto tmp = Ell<ValueType, IndexType>::create(exec, this->get_size(),
max_nnz_per_row);
exec->run(csr::make_convert_to_ell(this, tmp.get()));
Expand Down
5 changes: 2 additions & 3 deletions core/matrix/csr_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,12 @@ namespace kernels {
#define GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) \
void calculate_max_nnz_per_row( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<ValueType, IndexType>* source, size_type* result)
const matrix::Csr<ValueType, IndexType>* source, size_type& result)

#define GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType) \
void count_nonzeros_per_row( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<ValueType, IndexType>* source, \
Array<size_type>* result)
const matrix::Csr<ValueType, IndexType>* source, size_type* result)

#define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType) \
void calculate_nonzeros_per_row_in_span( \
Expand Down
4 changes: 2 additions & 2 deletions core/matrix/dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ inline void conversion_helper(Ell<ValueType, IndexType>* result,
auto exec = source->get_executor();
size_type num_stored_elements_per_row = 0;
exec->run(dense::make_calculate_max_nnz_per_row(
source, &num_stored_elements_per_row));
source, num_stored_elements_per_row));
const auto max_nnz_per_row = std::max(
result->get_num_stored_elements_per_row(), num_stored_elements_per_row);
const auto stride = std::max(result->get_stride(), source->get_size()[0]);
Expand All @@ -169,7 +169,7 @@ inline void conversion_helper(Hybrid<ValueType, IndexType>* result,
{
auto exec = source->get_executor();
Array<size_type> row_nnz(exec, source->get_size()[0]);
exec->run(dense::make_count_nonzeros_per_row(source, &row_nnz));
exec->run(dense::make_count_nonzeros_per_row(source, row_nnz.get_data()));
size_type ell_lim = zero<size_type>();
size_type coo_lim = zero<size_type>();
result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_lim);
Expand Down
4 changes: 2 additions & 2 deletions core/matrix/dense_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,12 @@ namespace kernels {
#define GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(_type) \
void calculate_max_nnz_per_row( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* source, size_type* result)
const matrix::Dense<_type>* source, size_type& result)

#define GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(_type) \
void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* source, \
Array<size_type>* result)
size_type* result)

#define GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL(_type) \
void calculate_total_cols(std::shared_ptr<const DefaultExecutor> exec, \
Expand Down
50 changes: 0 additions & 50 deletions cuda/matrix/csr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1225,39 +1225,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);


template <typename ValueType, typename IndexType>
void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
size_type* result)
{
const auto num_rows = source->get_size()[0];

auto nnz_per_row = Array<size_type>(exec, num_rows);
auto block_results = Array<size_type>(exec, default_block_size);
auto d_result = Array<size_type>(exec, 1);

const auto grid_dim = ceildiv(num_rows, default_block_size);
kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
num_rows, as_cuda_type(source->get_const_row_ptrs()),
as_cuda_type(nnz_per_row.get_data()));

const auto n = ceildiv(num_rows, default_block_size);
const auto reduce_dim = n <= default_block_size ? n : default_block_size;
kernel::reduce_max_nnz<<<reduce_dim, default_block_size>>>(
num_rows, as_cuda_type(nnz_per_row.get_const_data()),
as_cuda_type(block_results.get_data()));

kernel::reduce_max_nnz<<<1, default_block_size>>>(
reduce_dim, as_cuda_type(block_results.get_const_data()),
as_cuda_type(d_result.get_data()));

*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);


template <typename ValueType, typename IndexType>
void calculate_nonzeros_per_row_in_span(
std::shared_ptr<const DefaultExecutor> exec,
Expand Down Expand Up @@ -1349,23 +1316,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);


template <typename ValueType, typename IndexType>
void count_nonzeros_per_row(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
Array<size_type>* result)
{
const auto num_rows = source->get_size()[0];
auto row_ptrs = source->get_const_row_ptrs();
auto grid_dim = ceildiv(num_rows, default_block_size);

kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
num_rows, as_cuda_type(row_ptrs), as_cuda_type(result->get_data()));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL);


template <typename ValueType, typename IndexType>
void sort_by_column_index(std::shared_ptr<const CudaExecutor> exec,
matrix::Csr<ValueType, IndexType>* to_sort)
Expand Down
25 changes: 2 additions & 23 deletions cuda/matrix/dense_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL);
template <typename ValueType>
void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType>* source,
size_type* result)
size_type& result)
{
const auto num_rows = source->get_size()[0];
auto nnz_per_row = Array<size_type>(exec, num_rows);
Expand All @@ -332,34 +332,13 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
grid_dim, as_cuda_type(block_results.get_const_data()),
as_cuda_type(d_result.get_data()));

*result = exec->copy_val_to_host(d_result.get_const_data());
result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);


template <typename ValueType>
void count_nonzeros_per_row(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType>* source,
Array<size_type>* result)
{
const dim3 block_size(default_block_size, 1, 1);
auto rows_per_block = ceildiv(default_block_size, config::warp_size);
const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block);
const dim3 grid_size(grid_x, 1, 1);
if (grid_x > 0) {
kernel::count_nnz_per_row<<<grid_size, block_size>>>(
source->get_size()[0], source->get_size()[1], source->get_stride(),
as_cuda_type(source->get_const_values()),
as_cuda_type(result->get_data()));
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);


template <typename ValueType>
void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType>* source,
Expand Down
8 changes: 4 additions & 4 deletions cuda/test/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,9 +596,9 @@ TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef)
gko::size_type dmax_nnz_per_row;

gko::kernels::reference::csr::calculate_max_nnz_per_row(ref, mtx.get(),
&max_nnz_per_row);
max_nnz_per_row);
gko::kernels::cuda::csr::calculate_max_nnz_per_row(cuda, dmtx.get(),
&dmax_nnz_per_row);
dmax_nnz_per_row);

ASSERT_EQ(max_nnz_per_row, dmax_nnz_per_row);
}
Expand Down Expand Up @@ -690,9 +690,9 @@ TEST_F(Csr, CalculatesNonzerosPerRow)
gko::Array<gko::size_type> drow_nnz(cuda, dmtx->get_size()[0]);

gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(),
&row_nnz);
row_nnz.get_data());
gko::kernels::cuda::csr::count_nonzeros_per_row(cuda, dmtx.get(),
&drow_nnz);
drow_nnz.get_data());

GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
}
Expand Down
10 changes: 5 additions & 5 deletions cuda/test/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,10 +522,10 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
gko::Array<gko::size_type> dnnz_per_row(cuda);
dnnz_per_row.resize_and_reset(dx->get_size()[0]);

gko::kernels::reference::dense::count_nonzeros_per_row(ref, x.get(),
&nnz_per_row);
gko::kernels::reference::dense::count_nonzeros_per_row(
ref, x.get(), nnz_per_row.get_data());
gko::kernels::cuda::dense::count_nonzeros_per_row(cuda, dx.get(),
&dnnz_per_row);
dnnz_per_row.get_data());

auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
Expand All @@ -541,9 +541,9 @@ TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef)
gko::size_type dmax_nnz;

gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(),
&max_nnz);
max_nnz);
gko::kernels::cuda::dense::calculate_max_nnz_per_row(cuda, dx.get(),
&dmax_nnz);
dmax_nnz);

ASSERT_EQ(max_nnz, dmax_nnz);
}
Expand Down
51 changes: 0 additions & 51 deletions dpcpp/matrix/csr_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2537,39 +2537,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);


template <typename ValueType, typename IndexType>
void calculate_max_nnz_per_row(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
size_type* result)
{
const auto num_rows = source->get_size()[0];

auto nnz_per_row = Array<size_type>(exec, num_rows);
auto block_results = Array<size_type>(exec, default_block_size);
auto d_result = Array<size_type>(exec, 1);

const auto grid_dim = ceildiv(num_rows, default_block_size);
kernel::calculate_nnz_per_row(
grid_dim, default_block_size, 0, exec->get_queue(), num_rows,
source->get_const_row_ptrs(), nnz_per_row.get_data());

const auto n = ceildiv(num_rows, default_block_size);
const auto reduce_dim = n <= default_block_size ? n : default_block_size;
kernel::reduce_max_nnz(reduce_dim, default_block_size, 0, exec->get_queue(),
num_rows, nnz_per_row.get_const_data(),
block_results.get_data());

kernel::reduce_max_nnz(1, default_block_size, 0, exec->get_queue(),
reduce_dim, block_results.get_const_data(),
d_result.get_data());

*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);


template <typename ValueType, typename IndexType>
void convert_to_hybrid(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
Expand Down Expand Up @@ -2611,24 +2578,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);


template <typename ValueType, typename IndexType>
void count_nonzeros_per_row(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
Array<size_type>* result)
{
const auto num_rows = source->get_size()[0];
auto row_ptrs = source->get_const_row_ptrs();
auto grid_dim = ceildiv(num_rows, default_block_size);

kernel::calculate_nnz_per_row(grid_dim, default_block_size, 0,
exec->get_queue(), num_rows, row_ptrs,
result->get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL);


template <typename ValueType, typename IndexType>
void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
matrix::Csr<ValueType, IndexType>* to_sort)
Expand Down
Loading

0 comments on commit a100f16

Please sign in to comment.