From b1c77eb8bab46cd983f9069ed976fd89038e506e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 12 Oct 2021 13:12:58 +0200 Subject: [PATCH 01/32] avoid calling empty GEMMs for CUDA/HIP --- cuda/matrix/dense_kernels.cu | 40 +++++++++++++++++++------------ hip/matrix/dense_kernels.hip.cpp | 41 ++++++++++++++++++++------------ 2 files changed, 51 insertions(+), 30 deletions(-) diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 7e14375a6e4..083b3de407f 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -78,15 +78,19 @@ void simple_apply(std::shared_ptr exec, { if (cublas::is_supported::value) { auto handle = exec->get_cublas_handle(); - { - cublas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); + if (c->get_size()[0] * c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } } } else { GKO_NOT_IMPLEMENTED; @@ -103,12 +107,18 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { if (cublas::is_supported::value) { - cublas::gemm(exec->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), a->get_stride(), - beta->get_const_values(), c->get_values(), - c->get_stride()); + if (c->get_size()[0] * c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + cublas::gemm( + exec->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), a->get_stride(), + beta->get_const_values(), c->get_values(), c->get_stride()); + } else { + dense::scale(exec, beta, c); + } + } } else { GKO_NOT_IMPLEMENTED; } diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 971e72a2d07..e24c625bd45 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -81,15 +81,20 @@ void simple_apply(std::shared_ptr exec, { if (hipblas::is_supported::value) { auto handle = exec->get_hipblas_handle(); - { - hipblas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - hipblas::gemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); + if (c->get_size()[0] * c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + hipblas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + hipblas::gemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, + c->get_size()[1], c->get_size()[0], + a->get_size()[1], &alpha, b->get_const_values(), + b->get_stride(), a->get_const_values(), + a->get_stride(), &beta, c->get_values(), + c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } } } else { GKO_NOT_IMPLEMENTED; @@ -106,12 +111,18 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { if (hipblas::is_supported::value) { - hipblas::gemm(exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), a->get_stride(), - beta->get_const_values(), c->get_values(), - c->get_stride()); + if (c->get_size()[0] * c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + hipblas::gemm( + exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), a->get_stride(), + beta->get_const_values(), c->get_values(), c->get_stride()); + } else { + dense::scale(exec, beta, c); + } + } } else { GKO_NOT_IMPLEMENTED; } From 3afdab6d1665d6cb9ebd6159672e4a7ad075b916 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 12 Oct 2021 13:15:18 +0200 Subject: [PATCH 02/32] simplify matrix write impl --- core/matrix/dense.cpp | 1 - core/matrix/diagonal.cpp | 9 +-------- core/matrix/ell.cpp | 9 +-------- core/matrix/fbcsr.cpp | 9 +-------- core/matrix/sellp.cpp | 9 +-------- core/matrix/sparsity_csr.cpp | 9 +-------- 6 files changed, 5 insertions(+), 41 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 67f53542c07..8388dd58f09 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -723,7 +723,6 @@ namespace { template inline void write_impl(const MatrixType* mtx, MatrixData& data) { - std::unique_ptr op{}; auto tmp = make_temporary_clone(mtx->get_executor()->get_master(), mtx); data = {mtx->get_size(), {}}; diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 5102c3134b1..03cd801818e 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -283,14 +283,7 @@ namespace { template inline void write_impl(const MatrixType* mtx, MatrixData& data) { - std::unique_ptr op{}; - const MatrixType* tmp{}; - if (mtx->get_executor()->get_master() != mtx->get_executor()) { - op = mtx->clone(mtx->get_executor()->get_master()); - tmp = static_cast(op.get()); - } else { - tmp = mtx; - } + auto tmp = make_temporary_clone(mtx->get_executor()->get_master(), mtx); data = {tmp->get_size(), {}}; const auto values = tmp->get_const_values(); diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index e6ec26c7a22..782e51523c5 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -229,14 +229,7 @@ void Ell::read(const mat_data& data) template void Ell::write(mat_data& data) const { - std::unique_ptr op{}; - const Ell* tmp{}; - if (this->get_executor()->get_master() != this->get_executor()) { - op = this->clone(this->get_executor()->get_master()); - tmp = static_cast(op.get()); - } else { - tmp = this; - } + auto tmp = make_temporary_clone(this->get_executor()->get_master(), this); data = {tmp->get_size(), {}}; diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 69e8956490c..a80b0cfaceb 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -244,14 +244,7 @@ void Fbcsr::read(const mat_data& data) template void Fbcsr::write(mat_data& data) const { - std::unique_ptr op{}; - const Fbcsr* tmp{}; - if (this->get_executor()->get_master() != this->get_executor()) { - op = this->clone(this->get_executor()->get_master()); - tmp = static_cast(op.get()); - } else { - tmp = this; - } + auto tmp = make_temporary_clone(this->get_executor()->get_master(), this); data = {tmp->get_size(), {}}; diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index cd16daba607..0dfb8dc0ff5 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -197,14 +197,7 @@ void Sellp::read(const mat_data& data) template void Sellp::write(mat_data& data) const { - std::unique_ptr op{}; - const Sellp* tmp{}; - if (this->get_executor()->get_master() != this->get_executor()) { - op = this->clone(this->get_executor()->get_master()); - tmp = static_cast(op.get()); - } else { - tmp = this; - } + auto tmp = make_temporary_clone(this->get_executor()->get_master(), this); data = {tmp->get_size(), {}}; diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 72cdd4ba7b1..144f1b4ec6a 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -125,14 +125,7 @@ void SparsityCsr::read(const mat_data& data) template void SparsityCsr::write(mat_data& data) const { - std::unique_ptr op{}; - const SparsityCsr* tmp{}; - if (this->get_executor()->get_master() != this->get_executor()) { - op = this->clone(this->get_executor()->get_master()); - tmp = static_cast(op.get()); - } else { - tmp = this; - } + auto tmp = make_temporary_clone(this->get_executor()->get_master(), this); data = {tmp->get_size(), {}}; From e4591d10cf1ede91b8990f41de3a3f61e6d5a6ba Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 12 Oct 2021 14:30:07 +0200 Subject: [PATCH 03/32] initialize Csr arrays to valid values --- include/ginkgo/core/matrix/csr.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index fe148f3bc4f..5a4a3768920 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -985,7 +985,10 @@ class Csr : public EnableLinOp>, row_ptrs_(exec, size[0] + 1), srow_(exec, strategy->clac_size(num_nonzeros)), strategy_(strategy->copy()) - {} + { + row_ptrs_.fill(0); + this->make_srow(); + } /** * Creates a CSR matrix from already allocated (and initialized) row From e362d4fc9f7b257081a8c7178b439b6c0e18f29a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 13 Oct 2021 14:56:40 +0200 Subject: [PATCH 04/32] fix CUDA compilation implicit include paths --- cuda/CMakeLists.txt | 3 ++- cuda/test/preconditioner/isai_kernels.cpp | 1 - cuda/test/solver/lower_trs_kernels.cpp | 3 --- cuda/test/solver/upper_trs_kernels.cpp | 3 --- 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index fd1ed8ca3a4..4945056790b 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -193,7 +193,8 @@ target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_CO ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) target_include_directories(ginkgo_cuda - SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) + SYSTEM PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + # include path for generated headers like jacobi_common.hpp target_include_directories(ginkgo_cuda PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/cuda/test/preconditioner/isai_kernels.cpp b/cuda/test/preconditioner/isai_kernels.cpp index 8a32a45b536..f50b5f718b7 100644 --- a/cuda/test/preconditioner/isai_kernels.cpp +++ b/cuda/test/preconditioner/isai_kernels.cpp @@ -48,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/isai_kernels.hpp" -#include "cuda/base/config.hpp" #include "cuda/test/utils.hpp" #include "matrices/config.hpp" diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp index 485891f16e5..59ad0ab0cfc 100644 --- a/cuda/test/solver/lower_trs_kernels.cpp +++ b/cuda/test/solver/lower_trs_kernels.cpp @@ -37,9 +37,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - #include diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp index 9e6566f9030..bfd81b72675 100644 --- a/cuda/test/solver/upper_trs_kernels.cpp +++ b/cuda/test/solver/upper_trs_kernels.cpp @@ -37,9 +37,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - #include From 628e78973408230f2a16f77ef5735e2db75a3709 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 13 Oct 2021 14:57:26 +0200 Subject: [PATCH 05/32] add CSR SpMM support, make `classical` fall-back --- cuda/base/cusparse_bindings.hpp | 47 +++- cuda/base/types.hpp | 94 +++---- cuda/matrix/csr_kernels.cu | 381 +++++++++++++++-------------- cuda/solver/common_trs_kernels.cuh | 11 +- 4 files changed, 282 insertions(+), 251 deletions(-) diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 4bec360f0a7..b3af4e226a0 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -166,6 +166,34 @@ inline void spmv(cusparseHandle_t handle, cusparseOperation_t opA, } +template +inline void spmm_buffersize(cusparseHandle_t handle, cusparseOperation_t opB, + cusparseOperation_t opA, const ValueType* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t vecX, + const ValueType* beta, + const cusparseDnMatDescr_t vecY, + cusparseSpMMAlg_t alg, size_type* bufferSize) +{ + constexpr auto value_type = cuda_data_type(); + cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, vecX, beta, vecY, + value_type, alg, bufferSize); +} + +template +inline void spmm(cusparseHandle_t handle, cusparseOperation_t opA, + cusparseOperation_t opB, const ValueType* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t vecX, const ValueType* beta, + const cusparseDnMatDescr_t vecY, cusparseSpMMAlg_t alg, + void* externalBuffer) +{ + constexpr auto value_type = cuda_data_type(); + cusparseSpMM(handle, opA, opB, alpha, matA, vecX, beta, vecY, value_type, + alg, externalBuffer); +} + + #endif @@ -813,24 +841,25 @@ inline cusparseDnVecDescr_t create_dnvec(int64 size, ValueType* values) } -inline void destroy(cusparseDnVecDescr_t descr) -{ - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(descr)); -} - - template -inline cusparseDnMatDescr_t create_dnmat(int64 rows, int64 cols, int64 stride, +inline cusparseDnMatDescr_t create_dnmat(gko::dim<2> size, size_type stride, ValueType* values) { cusparseDnMatDescr_t descr{}; constexpr auto value_type = cuda_data_type(); - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateDnMat( - &descr, rows, cols, stride, values, value_type, CUSPARSE_ORDER_ROW)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateDnMat(&descr, size[0], size[1], stride, values, + value_type, CUSPARSE_ORDER_ROW)); return descr; } +inline void destroy(cusparseDnVecDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(descr)); +} + + inline void destroy(cusparseDnMatDescr_t descr) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnMat(descr)); diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index b7c4b1712f8..8c8df0c912b 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -210,64 +210,42 @@ struct cuda_type_impl> { template -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_C_8U; -} - -template <> -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_R_16F; -} +struct cuda_data_type_impl {}; template <> -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_R_32F; -} - -template <> -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_R_64F; -} +struct cuda_data_type_impl { + constexpr static cudaDataType_t value = CUDA_R_16F; +}; template <> -constexpr cudaDataType_t cuda_data_type_impl>() -{ - return CUDA_C_32F; -} +struct cuda_data_type_impl { + constexpr static cudaDataType_t value = CUDA_R_32F; +}; template <> -constexpr cudaDataType_t cuda_data_type_impl>() -{ - return CUDA_C_64F; -} +struct cuda_data_type_impl { + constexpr static cudaDataType_t value = CUDA_R_64F; +}; template <> -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_R_32I; -} +struct cuda_data_type_impl> { + constexpr static cudaDataType_t value = CUDA_C_32F; +}; template <> -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_R_32U; -} +struct cuda_data_type_impl> { + constexpr static cudaDataType_t value = CUDA_C_64F; +}; template <> -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_R_8I; -} +struct cuda_data_type_impl { + constexpr static cudaDataType_t value = CUDA_R_32I; +}; template <> -constexpr cudaDataType_t cuda_data_type_impl() -{ - return CUDA_R_8U; -} +struct cuda_data_type_impl { + constexpr static cudaDataType_t value = CUDA_R_8I; +}; #if defined(CUDA_VERSION) && \ @@ -276,22 +254,22 @@ constexpr cudaDataType_t cuda_data_type_impl() template -constexpr cusparseIndexType_t cusparse_index_type_impl() -{ - return CUSPARSE_INDEX_16U; -} +struct cusparse_index_type_impl {}; template <> -constexpr cusparseIndexType_t cusparse_index_type_impl() -{ - return CUSPARSE_INDEX_32I; -} +struct cusparse_index_type_impl { + constexpr static cusparseIndexType_t value = CUSPARSE_INDEX_16U; +}; template <> -constexpr cusparseIndexType_t cusparse_index_type_impl() -{ - return CUSPARSE_INDEX_64I; -} +struct cusparse_index_type_impl { + constexpr static cusparseIndexType_t value = CUSPARSE_INDEX_32I; +}; + +template <> +struct cusparse_index_type_impl { + constexpr static cusparseIndexType_t value = CUSPARSE_INDEX_64I; +}; #endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= @@ -312,7 +290,7 @@ constexpr cusparseIndexType_t cusparse_index_type_impl() template constexpr cudaDataType_t cuda_data_type() { - return detail::cuda_data_type_impl(); + return detail::cuda_data_type_impl::value; } @@ -332,7 +310,7 @@ constexpr cudaDataType_t cuda_data_type() template constexpr cusparseIndexType_t cusparse_index_type() { - return detail::cusparse_index_type_impl(); + return detail::cusparse_index_type_impl::value; } diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index e25c923a1ae..4b6415c0b65 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -100,6 +100,7 @@ using spgeam_kernels = namespace host_kernel { +namespace { template @@ -246,24 +247,27 @@ void classical_spmv(syn::value_list, GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); -} // namespace host_kernel - - template -void spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, matrix::Dense* c) +void load_balance_spmv(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) { - if (a->get_strategy()->get_name() == "load_balance") { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); - const IndexType nwarps = a->get_num_srow_elements(); - if (nwarps > 0) { - const dim3 csr_block(config::warp_size, warps_in_block, 1); - const dim3 csr_grid(ceildiv(nwarps, warps_in_block), - b->get_size()[1]); + if (beta) { + dense::scale(exec, beta, c); + } else { + dense::fill(exec, c, zero()); + } + const IndexType nwarps = a->get_num_srow_elements(); + if (nwarps > 0) { + const dim3 csr_block(config::warp_size, warps_in_block, 1); + const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); + if (alpha) { kernel::abstract_spmv<<>>( nwarps, static_cast(a->get_size()[0]), + as_cuda_type(alpha->get_const_values()), as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), as_cuda_type(a->get_const_srow()), @@ -271,8 +275,128 @@ void spmv(std::shared_ptr exec, as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()), as_cuda_type(c->get_stride())); } else { - GKO_NOT_SUPPORTED(nwarps); + kernel::abstract_spmv<<>>( + nwarps, static_cast(a->get_size()[0]), + as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(a->get_const_srow()), + as_cuda_type(b->get_const_values()), + as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()), + as_cuda_type(c->get_stride())); } + } else { + GKO_NOT_SUPPORTED(nwarps); + } +} + + +template +bool try_general_sparselib_spmv(std::shared_ptr exec, + const ValueType* alpha, + const matrix::Csr* a, + const matrix::Dense* b, + const ValueType* beta, + matrix::Dense* c) +{ + auto handle = exec->get_cusparse_handle(); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + if (!cusparse::is_supported::value || + b->get_stride() != 1 || c->get_stride() != 1 || b->get_size()[0] == 0 || + c->get_size()[0] == 0) { + return false; + } + + auto descr = cusparse::create_mat_descr(); + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); + cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], + a->get_size()[1], a->get_num_stored_elements(), alpha, descr, + a->get_const_values(), row_ptrs, col_idxs, + b->get_const_values(), beta, c->get_values()); + + cusparse::destroy(descr); +#else // CUDA_VERSION >= 11000 + cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto row_ptrs = const_cast(a->get_const_row_ptrs()); + auto col_idxs = const_cast(a->get_const_col_idxs()); + auto values = const_cast(a->get_const_values()); + auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), row_ptrs, + col_idxs, values); + auto b_val = const_cast(b->get_const_values()); + auto c_val = c->get_values(); + if (b->get_stride() == 1 && c->get_stride() == 1) { + auto vecb = cusparse::create_dnvec(b->get_size()[0], b_val); + auto vecc = cusparse::create_dnvec(c->get_size()[0], c_val); + cusparseSpMVAlg_t alg = CUSPARSE_CSRMV_ALG1; + size_type buffer_size = 0; + cusparse::spmv_buffersize(handle, trans, alpha, mat, vecb, + beta, vecc, alg, &buffer_size); + + gko::Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::spmv(handle, trans, alpha, mat, vecb, beta, vecc, + alg, buffer); + cusparse::destroy(vecb); + cusparse::destroy(vecc); + } else { + cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2; + auto vecb = + cusparse::create_dnmat(b->get_size(), b->get_stride(), b_val); + auto vecc = + cusparse::create_dnmat(c->get_size(), c->get_stride(), c_val); + size_type buffer_size = 0; + cusparse::spmm_buffersize(handle, trans, trans, alpha, mat, + vecb, beta, vecc, alg, + &buffer_size); + + gko::Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::spmm(handle, trans, trans, alpha, mat, vecb, beta, + vecc, alg, buffer); + cusparse::destroy(vecb); + cusparse::destroy(vecc); + } + cusparse::destroy(mat); +#endif + return true; +} + + +template +bool try_sparselib_spmv(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + if (alpha) { + return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, + beta->get_const_values(), c); + } else { + auto handle = exec->get_cusparse_handle(); + cusparse::pointer_mode_guard pm_guard(handle); + const auto valpha = one(); + const auto vbeta = zero(); + return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); + } +} + + +} // anonymous namespace +} // namespace host_kernel + + +template +void spmv(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, matrix::Dense* c) +{ + if (c->get_size()[0] * c->get_size()[1] == 0) { + // empty output: nothing to do + } else if (a->get_strategy()->get_name() == "load_balance") { + host_kernel::load_balance_spmv(exec, a, b, c); } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = host_kernel::compute_items_per_thread(exec); @@ -282,83 +406,37 @@ void spmv(std::shared_ptr exec, return items_per_thread == compiled_info; }, syn::value_list(), syn::type_list<>(), exec, a, b, c); - } else if (a->get_strategy()->get_name() == "classical") { - IndexType max_length_per_row = 0; - using Tcsr = matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>(a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - GKO_NOT_SUPPORTED(a->get_strategy()); + } else { + bool use_classical = true; + if (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse") { + use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c); } - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); - } else if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - auto handle = exec->get_cusparse_handle(); - { - cusparse::pointer_mode_guard pm_guard(handle); - const auto alpha = one(); - const auto beta = zero(); - // TODO: add implementation for int64 and multiple RHS - if (b->get_stride() != 1 || c->get_stride() != 1) - GKO_NOT_IMPLEMENTED; - -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // TODO: add implementation for int64 and multiple RHS - if (!cusparse::is_supported::value) { - GKO_NOT_IMPLEMENTED; + if (use_classical) { + IndexType max_length_per_row = 0; + using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + // as a fall-back: use average row length, at least 1 + max_length_per_row = std::max( + a->get_num_stored_elements() / + std::max(a->get_size()[0], 1), + 1); } - - auto descr = cusparse::create_mat_descr(); - auto row_ptrs = a->get_const_row_ptrs(); - auto col_idxs = a->get_const_col_idxs(); - cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - a->get_size()[0], a->get_size()[1], - a->get_num_stored_elements(), &alpha, descr, - a->get_const_values(), row_ptrs, col_idxs, - b->get_const_values(), &beta, c->get_values()); - - cusparse::destroy(descr); -#else // CUDA_VERSION >= 11000 - cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseSpMVAlg_t alg = CUSPARSE_CSRMV_ALG1; - auto row_ptrs = const_cast(a->get_const_row_ptrs()); - auto col_idxs = const_cast(a->get_const_col_idxs()); - auto values = const_cast(a->get_const_values()); - auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1], - a->get_num_stored_elements(), - row_ptrs, col_idxs, values); - auto b_val = const_cast(b->get_const_values()); - auto c_val = c->get_values(); - auto vecb = - cusparse::create_dnvec(b->get_num_stored_elements(), b_val); - auto vecc = - cusparse::create_dnvec(c->get_num_stored_elements(), c_val); - size_type buffer_size = 0; - cusparse::spmv_buffersize(handle, trans, &alpha, mat, - vecb, &beta, vecc, alg, - &buffer_size); - - gko::Array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - cusparse::spmv(handle, trans, &alpha, mat, vecb, &beta, - vecc, alg, buffer); - cusparse::destroy(vecb); - cusparse::destroy(vecc); - cusparse::destroy(mat); -#endif + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); } - } else { - GKO_NOT_IMPLEMENTED; } } @@ -373,94 +451,10 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - if (a->get_strategy()->get_name() == "load_balance") { - dense::scale(exec, beta, c); - - const IndexType nwarps = a->get_num_srow_elements(); - - if (nwarps > 0) { - const dim3 csr_block(config::warp_size, warps_in_block, 1); - const dim3 csr_grid(ceildiv(nwarps, warps_in_block), - b->get_size()[1]); - kernel::abstract_spmv<<>>( - nwarps, static_cast(a->get_size()[0]), - as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(a->get_const_srow()), - as_cuda_type(b->get_const_values()), - as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()), - as_cuda_type(c->get_stride())); - } else { - GKO_NOT_SUPPORTED(nwarps); - } - } else if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - // TODO: add implementation for int64 and multiple RHS - if (b->get_stride() != 1 || c->get_stride() != 1) GKO_NOT_IMPLEMENTED; - -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!cusparse::is_supported::value) { - GKO_NOT_IMPLEMENTED; - } - - auto descr = cusparse::create_mat_descr(); - auto row_ptrs = a->get_const_row_ptrs(); - auto col_idxs = a->get_const_col_idxs(); - cusparse::spmv(exec->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], - a->get_size()[1], a->get_num_stored_elements(), - alpha->get_const_values(), descr, a->get_const_values(), - row_ptrs, col_idxs, b->get_const_values(), - beta->get_const_values(), c->get_values()); - - cusparse::destroy(descr); -#else // CUDA_VERSION >= 11000 - cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseSpMVAlg_t alg = CUSPARSE_CSRMV_ALG1; - auto row_ptrs = const_cast(a->get_const_row_ptrs()); - auto col_idxs = const_cast(a->get_const_col_idxs()); - auto values = const_cast(a->get_const_values()); - auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1], - a->get_num_stored_elements(), row_ptrs, - col_idxs, values); - auto b_val = const_cast(b->get_const_values()); - auto c_val = c->get_values(); - auto vecb = cusparse::create_dnvec(b->get_num_stored_elements(), b_val); - auto vecc = cusparse::create_dnvec(c->get_num_stored_elements(), c_val); - size_type buffer_size = 0; - cusparse::spmv_buffersize( - exec->get_cusparse_handle(), trans, alpha->get_const_values(), mat, - vecb, beta->get_const_values(), vecc, alg, &buffer_size); - gko::Array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - cusparse::spmv(exec->get_cusparse_handle(), trans, - alpha->get_const_values(), mat, vecb, - beta->get_const_values(), vecc, alg, buffer); - cusparse::destroy(vecb); - cusparse::destroy(vecc); - cusparse::destroy(mat); -#endif - } else if (a->get_strategy()->get_name() == "classical") { - IndexType max_length_per_row = 0; - using Tcsr = matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>(a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - GKO_NOT_SUPPORTED(a->get_strategy()); - } - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + if (c->get_size()[0] * c->get_size()[1] == 0) { + // empty output: nothing to do + } else if (a->get_strategy()->get_name() == "load_balance") { + host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta); } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = host_kernel::compute_items_per_thread(exec); @@ -472,7 +466,38 @@ void advanced_spmv(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); } else { - GKO_NOT_IMPLEMENTED; + bool use_classical = true; + if (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse") { + use_classical = + !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta); + } + if (use_classical) { + IndexType max_length_per_row = 0; + using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + // as a fall-back: use average row length, at least 1 + max_length_per_row = std::max( + a->get_num_stored_elements() / + std::max(a->get_size()[0], 1), + 1); + } + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, + alpha, beta); + } } } diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 2b10b625382..65c71fbae3d 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -113,10 +113,10 @@ struct CudaSolveStruct : gko::solver::SolveStruct { // workaround suggested by NVIDIA engineers: for some reason // cusparse needs non-nullptr input vectors even for analysis auto descr_b = cusparse::create_dnmat( - matrix->get_size()[0], num_rhs, matrix->get_size()[1], + dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], reinterpret_cast(0xDEAD)); auto descr_c = cusparse::create_dnmat( - matrix->get_size()[0], num_rhs, matrix->get_size()[1], + dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], reinterpret_cast(0xDEAF)); auto work_size = cusparse::spsm_buffer_size( @@ -143,11 +143,10 @@ struct CudaSolveStruct : gko::solver::SolveStruct { { cusparse::pointer_mode_guard pm_guard(handle); auto descr_b = cusparse::create_dnmat( - input->get_size()[0], input->get_size()[1], input->get_stride(), + input->get_size(), input->get_stride(), const_cast(input->get_const_values())); - auto descr_c = - cusparse::create_dnmat(output->get_size()[0], output->get_size()[1], - output->get_stride(), output->get_values()); + auto descr_c = cusparse::create_dnmat( + output->get_size(), output->get_stride(), output->get_values()); cusparse::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, one(), From 04b404c1156af244ecf05c8af85237dd57fdc62c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 18 Oct 2021 15:24:36 +0200 Subject: [PATCH 06/32] fix Csr initialization of empty load_balance mtx --- include/ginkgo/core/matrix/csr.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 5a4a3768920..c325bfbab65 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -433,10 +433,12 @@ class Csr : public EnableLinOp>, } const auto num_rows = mtx_row_ptrs.get_num_elems() - 1; const auto num_elems = row_ptrs[num_rows]; + const auto bucket_divider = + num_elems > 0 ? ceildiv(num_elems, warp_size_) : 1; for (size_type i = 0; i < num_rows; i++) { auto bucket = ceildiv((ceildiv(row_ptrs[i + 1], warp_size_) * nwarps), - ceildiv(num_elems, warp_size_)); + bucket_divider); if (bucket < nwarps) { srow[bucket]++; } From efd83d99d0ff8e7932b81cdc3536a804dc8ad364 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 27 Nov 2021 22:44:33 +0100 Subject: [PATCH 07/32] add is_zero/is_nonzero helpers --- common/cuda_hip/components/warp_blas.hpp.inc | 4 +-- .../solver/common_gmres_kernels.hpp.inc | 2 +- common/unified/solver/cgs_kernels.cpp | 4 +-- common/unified/solver/fcg_kernels.cpp | 2 +- core/matrix/dense.cpp | 2 +- cuda/components/device_matrix_data_kernels.cu | 6 ++-- .../device_matrix_data_kernels.dp.cpp | 11 ++++--- dpcpp/solver/common_gmres_kernels.dp.inc | 2 +- .../device_matrix_data_kernels.hip.cpp | 6 ++-- include/ginkgo/core/base/math.hpp | 30 +++++++++++++++++++ include/ginkgo/core/base/matrix_data.hpp | 19 ++++++------ omp/components/device_matrix_data_kernels.cpp | 4 +-- omp/preconditioner/jacobi_kernels.cpp | 4 +-- omp/solver/cb_gmres_kernels.cpp | 2 +- omp/solver/gmres_kernels.cpp | 2 +- .../components/device_matrix_data_kernels.cpp | 8 ++--- reference/preconditioner/jacobi_kernels.cpp | 6 ++-- reference/solver/bicg_kernels.cpp | 4 +-- reference/solver/bicgstab_kernels.cpp | 6 ++-- reference/solver/cb_gmres_kernels.cpp | 2 +- reference/solver/cg_kernels.cpp | 4 +-- reference/solver/cgs_kernels.cpp | 4 +-- reference/solver/fcg_kernels.cpp | 4 +-- reference/solver/gmres_kernels.cpp | 2 +- 24 files changed, 82 insertions(+), 58 deletions(-) diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp.inc index 2fb3f5f8a3e..2e3d82e99b4 100644 --- a/common/cuda_hip/components/warp_blas.hpp.inc +++ b/common/cuda_hip/components/warp_blas.hpp.inc @@ -60,7 +60,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform( ValueType* __restrict__ row, bool& __restrict__ status) { auto key_col_elem = group.shfl(row[key_col], key_row); - if (key_col_elem == zero()) { + if (is_zero(key_col_elem)) { // TODO: implement error handling for GPUs to be able to properly // report it here status = false; @@ -105,7 +105,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform_with_rhs( { auto key_col_elem = group.shfl(row[key_col], key_row); auto key_rhs_elem = group.shfl(rhs[0], key_row); - if (key_col_elem == zero()) { + if (is_zero(key_col_elem)) { // TODO: implement error handling for GPUs to be able to properly // report it here status = false; diff --git a/common/cuda_hip/solver/common_gmres_kernels.hpp.inc b/common/cuda_hip/solver/common_gmres_kernels.hpp.inc index 00ecfcd17af..3be1c712cd0 100644 --- a/common/cuda_hip/solver/common_gmres_kernels.hpp.inc +++ b/common/cuda_hip/solver/common_gmres_kernels.hpp.inc @@ -72,7 +72,7 @@ __device__ void calculate_sin_and_cos_kernel( ValueType* givens_sin, size_type stride_sin, ValueType* givens_cos, size_type stride_cos, ValueType& register_sin, ValueType& register_cos) { - if (this_hess == zero()) { + if (is_zero(this_hess)) { register_cos = zero(); register_sin = one(); } else { diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp index cf849fb9459..737460b2769 100644 --- a/common/unified/solver/cgs_kernels.cpp +++ b/common/unified/solver/cgs_kernels.cpp @@ -102,7 +102,7 @@ void step_1(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto r, auto u, auto p, auto q, auto beta, auto rho, auto prev_rho, auto stop) { if (!stop[col].has_stopped()) { - auto prev_rho_zero = prev_rho[col] == zero(prev_rho[col]); + auto prev_rho_zero = is_zero(prev_rho[col]); auto tmp = prev_rho_zero ? beta[col] : rho[col] / prev_rho[col]; if (row == 0 && !prev_rho_zero) { beta[col] = tmp; @@ -134,7 +134,7 @@ void step_2(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto u, auto v_hat, auto q, auto t, auto alpha, auto rho, auto gamma, auto stop) { if (!stop[col].has_stopped()) { - auto gamma_is_zero = gamma[col] == zero(gamma[col]); + auto gamma_is_zero = is_zero(gamma[col]); auto tmp = gamma_is_zero ? alpha[col] : rho[col] / gamma[col]; if (row == 0 && !gamma_is_zero) { alpha[col] = tmp; diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp index 40d81450d19..f14c6cc1467 100644 --- a/common/unified/solver/fcg_kernels.cpp +++ b/common/unified/solver/fcg_kernels.cpp @@ -117,7 +117,7 @@ void step_2(std::shared_ptr exec, exec, [] GKO_KERNEL(auto row, auto col, auto x, auto r, auto t, auto p, auto q, auto beta, auto rho, auto stop) { - if (!stop[col].has_stopped() && beta[col] != zero(beta[col])) { + if (!stop[col].has_stopped() && is_nonzero(beta[col])) { auto tmp = rho[col] / beta[col]; auto prev_r = r(row, col); x(row, col) += tmp * p(row, col); diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 8388dd58f09..ee38b91f43e 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -729,7 +729,7 @@ inline void write_impl(const MatrixType* mtx, MatrixData& data) for (size_type row = 0; row < data.size[0]; ++row) { for (size_type col = 0; col < data.size[1]; ++col) { - if (tmp->at(row, col) != zero()) { + if (is_nonzero(tmp->at(row, col))) { data.nonzeros.emplace_back(row, col, tmp->at(row, col)); } } diff --git a/cuda/components/device_matrix_data_kernels.cu b/cuda/components/device_matrix_data_kernels.cu index 233a1c6667a..96845d35e85 100644 --- a/cuda/components/device_matrix_data_kernels.cu +++ b/cuda/components/device_matrix_data_kernels.cu @@ -62,9 +62,7 @@ void remove_zeros(std::shared_ptr exec, auto nnz = thrust::count_if( thrust::device_pointer_cast(as_cuda_type(data.get_const_data())), thrust::device_pointer_cast(as_cuda_type(data.get_const_data() + size)), - [] __device__(nonzero_type entry) { - return entry.value != zero(entry.value); - }); + [] __device__(nonzero_type entry) { return is_nonzero(entry.value); }); if (nnz < size) { Array> result{ exec, static_cast(nnz)}; @@ -75,7 +73,7 @@ void remove_zeros(std::shared_ptr exec, as_cuda_type(data.get_const_data() + size)), thrust::device_pointer_cast(as_cuda_type(result.get_data())), [] __device__(nonzero_type entry) { - return entry.value != zero(entry.value); + return is_nonzero(entry.value); }); data = std::move(result); } diff --git a/dpcpp/components/device_matrix_data_kernels.dp.cpp b/dpcpp/components/device_matrix_data_kernels.dp.cpp index fcf640c3374..e3c80a00b15 100644 --- a/dpcpp/components/device_matrix_data_kernels.dp.cpp +++ b/dpcpp/components/device_matrix_data_kernels.dp.cpp @@ -56,14 +56,13 @@ void remove_zeros(std::shared_ptr exec, oneapi::dpl::execution::make_device_policy(*exec->get_queue()); auto nnz = std::count_if( policy, data.get_const_data(), data.get_const_data() + size, - [](nonzero_type entry) { return entry.value != zero(); }); + [](nonzero_type entry) { return is_nonzero(entry.value); }); if (nnz < size) { Array result{exec, static_cast(nnz)}; - std::copy_if(policy, data.get_const_data(), - data.get_const_data() + size, result.get_data(), - [](nonzero_type entry) { - return entry.value != zero(); - }); + std::copy_if( + policy, data.get_const_data(), data.get_const_data() + size, + result.get_data(), + [](nonzero_type entry) { return is_nonzero(entry.value); }); data = std::move(result); } } diff --git a/dpcpp/solver/common_gmres_kernels.dp.inc b/dpcpp/solver/common_gmres_kernels.dp.inc index e651f0d79cc..b850e60f09b 100644 --- a/dpcpp/solver/common_gmres_kernels.dp.inc +++ b/dpcpp/solver/common_gmres_kernels.dp.inc @@ -96,7 +96,7 @@ void calculate_sin_and_cos_kernel(size_type col_idx, size_type num_cols, ValueType ®ister_sin, ValueType ®ister_cos) { - if (this_hess == zero()) { + if (is_zero(this_hess)) { register_cos = zero(); register_sin = one(); } else { diff --git a/hip/components/device_matrix_data_kernels.hip.cpp b/hip/components/device_matrix_data_kernels.hip.cpp index 1b83333f196..820bd76a491 100644 --- a/hip/components/device_matrix_data_kernels.hip.cpp +++ b/hip/components/device_matrix_data_kernels.hip.cpp @@ -62,9 +62,7 @@ void remove_zeros(std::shared_ptr exec, auto nnz = thrust::count_if( thrust::device_pointer_cast(as_hip_type(data.get_const_data())), thrust::device_pointer_cast(as_hip_type(data.get_const_data() + size)), - [] __device__(nonzero_type entry) { - return entry.value != zero(entry.value); - }); + [] __device__(nonzero_type entry) { return is_nonzero(entry.value); }); if (nnz < size) { Array> result{ exec, static_cast(nnz)}; @@ -75,7 +73,7 @@ void remove_zeros(std::shared_ptr exec, as_hip_type(data.get_const_data() + size)), thrust::device_pointer_cast(as_hip_type(result.get_data())), [] __device__(nonzero_type entry) { - return entry.value != zero(entry.value); + return is_nonzero(entry.value); }); data = std::move(result); } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 509a3f600ce..9f237603bfa 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -823,6 +823,36 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&) #undef GKO_BIND_ZERO_ONE +/** + * Returns true if and only if the given value is zero. + * + * @tparam T the type of the value + * + * @param value the given value + * @return true iff the given value is zero, i.e. `value == zero()` + */ +template +GKO_INLINE GKO_ATTRIBUTES constexpr bool is_zero(T value) +{ + return value == zero(); +} + + +/** + * Returns true if and only if the given value is not zero. + * + * @tparam T the type of the value + * + * @param value the given value + * @return true iff the given value is not zero, i.e. `value != zero()` + */ +template +GKO_INLINE GKO_ATTRIBUTES constexpr bool is_nonzero(T value) +{ + return value != zero(); +} + + /** * Returns the larger of the arguments. * diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 05655a44fdf..e4b88ef76f3 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -166,7 +166,7 @@ struct matrix_data { matrix_data(dim<2> size_ = dim<2>{}, ValueType value = zero()) : size{size_} { - if (value == zero()) { + if (is_zero(value)) { return; } for (size_type row = 0; row < size[0]; ++row) { @@ -194,7 +194,7 @@ struct matrix_data { for (size_type col = 0; col < size[1]; ++col) { const auto value = detail::get_rand_value(dist, engine); - if (value != zero()) { + if (is_nonzero(value)) { nonzeros.emplace_back(row, col, value); } } @@ -214,7 +214,7 @@ struct matrix_data { size[1] = std::max(size[1], row_data.size()); for (size_type col = 0; col < row_data.size(); ++col) { const auto& val = begin(row_data)[col]; - if (val != zero()) { + if (is_nonzero(val)) { nonzeros.emplace_back(row, col, val); } } @@ -274,7 +274,7 @@ struct matrix_data { { for (gko::size_type row = 0; row < size[0]; ++row) { for (gko::size_type col = 0; col < size[1]; ++col) { - if (data(row, col) != zero()) { + if (is_nonzero(data(row, col))) { nonzeros.emplace_back(row, col, data(row, col)); } } @@ -292,7 +292,7 @@ struct matrix_data { static matrix_data diag(dim<2> size_, ValueType value) { matrix_data res(size_); - if (value != zero()) { + if (is_nonzero(value)) { const auto num_nnz = std::min(size_[0], size_[1]); res.nonzeros.reserve(num_nnz); for (size_type i = 0; i < num_nnz; ++i) { @@ -492,11 +492,10 @@ struct matrix_data { void remove_zeros() { - nonzeros.erase(std::remove_if(begin(nonzeros), end(nonzeros), - [](nonzero_type nz) { - return nz.value == zero(); - }), - end(nonzeros)); + nonzeros.erase( + std::remove_if(begin(nonzeros), end(nonzeros), + [](nonzero_type nz) { return is_zero(nz.value); }), + end(nonzeros)); } private: diff --git a/omp/components/device_matrix_data_kernels.cpp b/omp/components/device_matrix_data_kernels.cpp index 5278f3567d8..14cdab5dca6 100644 --- a/omp/components/device_matrix_data_kernels.cpp +++ b/omp/components/device_matrix_data_kernels.cpp @@ -63,7 +63,7 @@ void remove_zeros(std::shared_ptr exec, const auto end = std::min(size, begin + per_thread); for (auto i = begin; i < end; i++) { partial_counts[tidx] += - data.get_const_data()[i].value != zero() ? 1 : 0; + is_nonzero(data.get_const_data()[i].value) ? 1 : 0; } } std::partial_sum(partial_counts.begin(), partial_counts.end(), @@ -80,7 +80,7 @@ void remove_zeros(std::shared_ptr exec, auto out_idx = tidx == 0 ? size_type{} : partial_counts[tidx - 1]; for (auto i = begin; i < end; i++) { auto entry = data.get_const_data()[i]; - if (entry.value != zero()) { + if (is_nonzero(entry.value)) { result.get_data()[out_idx] = entry; out_idx++; } diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 36ad402157b..8befa904186 100644 --- a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -228,7 +228,7 @@ inline bool apply_gauss_jordan_transform(IndexType row, IndexType col, size_type stride) { const auto d = block[row * stride + col]; - if (d == zero()) { + if (is_zero(d)) { return false; } for (IndexType i = 0; i < block_size; ++i) { @@ -482,7 +482,7 @@ inline void apply_block(size_type block_size, size_type num_rhs, ValueType beta, ValueType* x, size_type stride_x, ValueConverter converter = {}) { - if (beta != zero()) { + if (is_nonzero(beta)) { for (size_type row = 0; row < block_size; ++row) { for (size_type col = 0; col < num_rhs; ++col) { x[row * stride_x + col] *= beta; diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index 17b44aa9187..c7fac0a060b 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -194,7 +194,7 @@ void calculate_sin_and_cos(matrix::Dense* givens_sin, matrix::Dense* hessenberg_iter, size_type iter, const size_type rhs) { - if (hessenberg_iter->at(iter, rhs) == zero()) { + if (is_zero(hessenberg_iter->at(iter, rhs))) { givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { diff --git a/omp/solver/gmres_kernels.cpp b/omp/solver/gmres_kernels.cpp index e05a0ca2033..0ad1d82736a 100644 --- a/omp/solver/gmres_kernels.cpp +++ b/omp/solver/gmres_kernels.cpp @@ -118,7 +118,7 @@ void calculate_sin_and_cos(matrix::Dense* givens_sin, matrix::Dense* hessenberg_iter, size_type iter, const size_type rhs) { - if (hessenberg_iter->at(iter, rhs) == zero()) { + if (is_zero(hessenberg_iter->at(iter, rhs))) { givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { diff --git a/reference/components/device_matrix_data_kernels.cpp b/reference/components/device_matrix_data_kernels.cpp index 88b24a52464..11e9f836bb4 100644 --- a/reference/components/device_matrix_data_kernels.cpp +++ b/reference/components/device_matrix_data_kernels.cpp @@ -50,16 +50,16 @@ void remove_zeros(std::shared_ptr exec, Array>& data) { auto size = data.get_num_elems(); - auto is_nonzero = [](matrix_data_entry entry) { - return entry.value != zero(); + auto is_nonzero_entry = [](matrix_data_entry entry) { + return is_nonzero(entry.value); }; auto nnz = std::count_if(data.get_const_data(), - data.get_const_data() + size, is_nonzero); + data.get_const_data() + size, is_nonzero_entry); if (nnz < size) { Array> result{ exec, static_cast(nnz)}; std::copy_if(data.get_const_data(), data.get_const_data() + size, - result.get_data(), is_nonzero); + result.get_data(), is_nonzero_entry); data = std::move(result); } } diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index 365e9286178..a3e3ecb41da 100644 --- a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -211,7 +211,7 @@ inline bool apply_gauss_jordan_transform(IndexType row, IndexType col, size_type stride) { const auto d = block[row * stride + col]; - if (d == zero()) { + if (is_zero(d)) { return false; } for (IndexType i = 0; i < block_size; ++i) { @@ -451,7 +451,7 @@ inline void apply_block(size_type block_size, size_type num_rhs, ValueType beta, ValueType* x, size_type stride_x, ValueConverter converter = {}) { - if (beta != zero()) { + if (is_nonzero(beta)) { for (size_type row = 0; row < block_size; ++row) { for (size_type col = 0; col < num_rhs; ++col) { x[row * stride_x + col] *= beta; @@ -613,7 +613,7 @@ void invert_diagonal(std::shared_ptr exec, const Array& diag, Array& inv_diag) { for (size_type i = 0; i < diag.get_num_elems(); ++i) { - auto diag_val = diag.get_const_data()[i] == zero() + auto diag_val = is_zero(diag.get_const_data()[i]) ? one() : diag.get_const_data()[i]; inv_diag.get_data()[i] = one() / diag_val; diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp index 8eed6600624..17af2c12ca2 100644 --- a/reference/solver/bicg_kernels.cpp +++ b/reference/solver/bicg_kernels.cpp @@ -91,7 +91,7 @@ void step_1(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (prev_rho->at(j) == zero()) { + if (is_zero(prev_rho->at(j))) { p->at(i, j) = z->at(i, j); p2->at(i, j) = z2->at(i, j); } else { @@ -121,7 +121,7 @@ void step_2(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (beta->at(j) != zero()) { + if (is_nonzero(beta->at(j))) { auto tmp = rho->at(j) / beta->at(j); x->at(i, j) += tmp * p->at(i, j); r->at(i, j) -= tmp * q->at(i, j); diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp index 05f803b00be..1b0b64281c5 100644 --- a/reference/solver/bicgstab_kernels.cpp +++ b/reference/solver/bicgstab_kernels.cpp @@ -105,7 +105,7 @@ void step_1(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (prev_rho->at(j) * omega->at(j) != zero()) { + if (is_nonzero(prev_rho->at(j) * omega->at(j))) { const auto tmp = rho->at(j) / prev_rho->at(j) * alpha->at(j) / omega->at(j); p->at(i, j) = r->at(i, j) + @@ -134,7 +134,7 @@ void step_2(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (beta->at(j) != zero()) { + if (is_nonzero(beta->at(j))) { alpha->at(j) = rho->at(j) / beta->at(j); s->at(i, j) = r->at(i, j) - alpha->at(j) * v->at(i, j); } else { @@ -161,7 +161,7 @@ void step_3( if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (beta->at(j) != zero()) { + if (is_nonzero(beta->at(j))) { omega->at(j) = gamma->at(j) / beta->at(j); } else { omega->at(j) = zero(); diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index 31374072abc..fae7d83210b 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -181,7 +181,7 @@ void calculate_sin_and_cos(matrix::Dense* givens_sin, matrix::Dense* hessenberg_iter, size_type iter, const size_type rhs) { - if (hessenberg_iter->at(iter, rhs) == zero()) { + if (is_zero(hessenberg_iter->at(iter, rhs))) { givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp index 846b969ed1d..fc6ddc9b12b 100644 --- a/reference/solver/cg_kernels.cpp +++ b/reference/solver/cg_kernels.cpp @@ -86,7 +86,7 @@ void step_1(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (prev_rho->at(j) == zero()) { + if (is_zero(prev_rho->at(j))) { p->at(i, j) = z->at(i, j); } else { auto tmp = rho->at(j) / prev_rho->at(j); @@ -113,7 +113,7 @@ void step_2(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (beta->at(j) != zero()) { + if (is_nonzero(beta->at(j))) { auto tmp = rho->at(j) / beta->at(j); x->at(i, j) += tmp * p->at(i, j); r->at(i, j) -= tmp * q->at(i, j); diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp index 461855cf865..35e35b94788 100644 --- a/reference/solver/cgs_kernels.cpp +++ b/reference/solver/cgs_kernels.cpp @@ -95,7 +95,7 @@ void step_1(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (rho_prev->at(j) != zero()) { + if (is_nonzero(rho_prev->at(j))) { beta->at(j) = rho->at(j) / rho_prev->at(j); } } @@ -128,7 +128,7 @@ void step_2(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (gamma->at(j) != zero()) { + if (is_nonzero(gamma->at(j))) { alpha->at(j) = rho->at(j) / gamma->at(j); } } diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp index 1e5a38b2938..d7cd9c809c7 100644 --- a/reference/solver/fcg_kernels.cpp +++ b/reference/solver/fcg_kernels.cpp @@ -87,7 +87,7 @@ void step_1(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (prev_rho->at(j) == zero()) { + if (is_zero(prev_rho->at(j))) { p->at(i, j) = z->at(i, j); } else { auto tmp = rho_t->at(j) / prev_rho->at(j); @@ -114,7 +114,7 @@ void step_2(std::shared_ptr exec, if (stop_status->get_const_data()[j].has_stopped()) { continue; } - if (beta->at(j) != zero()) { + if (is_nonzero(beta->at(j))) { auto tmp = rho->at(j) / beta->at(j); auto prev_r = r->at(i, j); x->at(i, j) += tmp * p->at(i, j); diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp index 12d3959e646..7b51dccd71d 100644 --- a/reference/solver/gmres_kernels.cpp +++ b/reference/solver/gmres_kernels.cpp @@ -108,7 +108,7 @@ void calculate_sin_and_cos(matrix::Dense* givens_sin, matrix::Dense* hessenberg_iter, size_type iter, const size_type rhs) { - if (hessenberg_iter->at(iter, rhs) == zero()) { + if (is_zero(hessenberg_iter->at(iter, rhs))) { givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { From 5178a81a35ebbea0c126e17a3bdddc3ae5b5be69 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 18 Oct 2021 15:26:23 +0200 Subject: [PATCH 08/32] keep numeric zeros in matrix_data and conversions --- core/matrix/coo.cpp | 6 ++--- core/matrix/hybrid.cpp | 34 ++---------------------- reference/matrix/hybrid_kernels.cpp | 8 +++--- reference/test/matrix/hybrid_kernels.cpp | 5 +--- 4 files changed, 9 insertions(+), 44 deletions(-) diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 3f4444f12dc..dd145dc3b5d 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -163,9 +163,9 @@ template void Coo::move_to(Csr* result) { auto exec = this->get_executor(); - auto tmp = Csr::create( - exec, this->get_size(), this->get_num_stored_elements(), - result->get_strategy()); + const auto nnz = this->get_num_stored_elements(); + auto tmp = Csr::create(exec, this->get_size(), nnz, + result->get_strategy()); tmp->values_ = std::move(this->values_); tmp->col_idxs_ = std::move(this->col_idxs_); exec->run(coo::make_convert_to_csr(this, tmp.get())); diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 1a2a4f21c66..67b0bcfe085 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -78,34 +78,6 @@ GKO_REGISTER_OPERATION(outplace_absolute_array, } // namespace hybrid -namespace { - - -template -void get_each_row_nnz(const matrix_data& data, - Array& row_nnz) -{ - size_type nnz = 0; - IndexType current_row = 0; - auto row_nnz_val = row_nnz.get_data(); - for (size_type i = 0; i < row_nnz.get_num_elems(); i++) { - row_nnz_val[i] = zero(); - } - for (const auto& elem : data.nonzeros) { - if (elem.row != current_row) { - row_nnz_val[current_row] = nnz; - current_row = elem.row; - nnz = 0; - } - nnz += (elem.value != zero()); - } - row_nnz_val[current_row] = nnz; -} - - -} // namespace - - template void Hybrid::apply_impl(const LinOp* b, LinOp* x) const { @@ -257,10 +229,8 @@ void Hybrid::write(mat_data& data) const } while (coo_ind < coo_nnz && coo_row_idxs[coo_ind] == row) { - if (coo_vals[coo_ind] != zero()) { - data.nonzeros.emplace_back(row, coo_col_idxs[coo_ind], - coo_vals[coo_ind]); - } + data.nonzeros.emplace_back(row, coo_col_idxs[coo_ind], + coo_vals[coo_ind]); coo_ind++; } } diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index 46d4b605001..571e17ad5a4 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -167,11 +167,9 @@ void convert_to_csr(std::shared_ptr exec, } // Coo part (row should be ascending) while (coo_idx < coo_nnz && coo_row[coo_idx] == row) { - if (coo_val[coo_idx] != zero()) { - csr_val[csr_idx] = coo_val[coo_idx]; - csr_col_idxs[csr_idx] = coo_col[coo_idx]; - csr_idx++; - } + csr_val[csr_idx] = coo_val[coo_idx]; + csr_col_idxs[csr_idx] = coo_col[coo_idx]; + csr_idx++; coo_idx++; } csr_row_ptrs[row + 1] = csr_idx; diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 02b291976d9..7a26de2a477 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -70,7 +70,7 @@ class Hybrid : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx1(Mtx::create(exec)), mtx2(Mtx::create(exec)), - mtx3(Mtx::create(exec, gko::dim<2>{2, 3}, 2, 2, 2)) + mtx3(Mtx::create(exec, gko::dim<2>{2, 3}, 2, 2, 1)) { // clang-format off mtx1 = gko::initialize({{1.0, 3.0, 2.0}, @@ -97,11 +97,8 @@ class Hybrid : public ::testing::Test { ell_col[3] = 1; // Set Coo values coo_val[0] = 2.0; - coo_val[1] = 0.0; coo_col[0] = 2; - coo_col[1] = 2; coo_row[0] = 0; - coo_row[1] = 1; } void assert_equal_to_mtx(const Csr* m) From 4a1d813de066426b0bfb1cddc8c21d331af407cf Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 29 Nov 2021 15:31:46 +0100 Subject: [PATCH 09/32] move row_ptr/row_idx conversion to components --- common/CMakeLists.txt | 3 +- common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 5 +- .../components/device_matrix_data_kernels.cpp | 34 +---- .../components/format_conversion_kernels.cpp | 119 ++++++++++++++++ .../components/device_matrix_data_kernels.hpp | 20 +-- core/components/format_conversion_kernels.hpp | 100 ++++++++++++++ core/device_hooks/common_kernels.inc.cpp | 23 ++-- core/factorization/par_ic.cpp | 11 +- core/factorization/par_ict.cpp | 9 +- core/factorization/par_ilut.cpp | 13 +- core/matrix/coo.cpp | 11 +- core/matrix/coo_kernels.hpp | 7 - core/matrix/csr.cpp | 6 +- core/matrix/csr_kernels.hpp | 7 - cuda/matrix/coo_kernels.cu | 32 ----- cuda/matrix/csr_kernels.cu | 29 ---- cuda/matrix/fbcsr_kernels.cu | 1 + cuda/matrix/hybrid_kernels.cu | 9 +- dpcpp/components/format_conversion.dp.hpp | 39 ------ dpcpp/matrix/coo_kernels.dp.cpp | 33 ----- dpcpp/matrix/csr_kernels.dp.cpp | 17 --- hip/matrix/coo_kernels.hip.cpp | 33 ----- hip/matrix/csr_kernels.hip.cpp | 30 ---- hip/matrix/fbcsr_kernels.hip.cpp | 1 + hip/matrix/hybrid_kernels.hip.cpp | 10 +- omp/components/format_conversion.hpp | 130 ------------------ omp/matrix/coo_kernels.cpp | 29 ---- omp/matrix/csr_kernels.cpp | 37 +---- omp/matrix/ell_kernels.cpp | 1 - omp/matrix/fbcsr_kernels.cpp | 17 ++- omp/matrix/hybrid_kernels.cpp | 6 +- omp/matrix/sparsity_csr_kernels.cpp | 12 +- reference/CMakeLists.txt | 1 + reference/components/format_conversion.hpp | 76 ---------- .../components/format_conversion_kernels.cpp | 100 ++++++++++++++ reference/matrix/coo_kernels.cpp | 31 +---- reference/matrix/csr_kernels.cpp | 37 ++--- reference/matrix/fbcsr_kernels.cpp | 20 ++- reference/matrix/hybrid_kernels.cpp | 2 +- reference/matrix/sparsity_csr_kernels.cpp | 12 +- 40 files changed, 445 insertions(+), 668 deletions(-) create mode 100644 common/unified/components/format_conversion_kernels.cpp create mode 100644 core/components/format_conversion_kernels.hpp delete mode 100644 omp/components/format_conversion.hpp delete mode 100644 reference/components/format_conversion.hpp create mode 100644 reference/components/format_conversion_kernels.cpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index cafffb708f6..523e8f5c68c 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,8 +1,9 @@ set(UNIFIED_SOURCES base/index_set_kernels.cpp - components/device_matrix_data_kernels.cpp components/absolute_array_kernels.cpp + components/device_matrix_data_kernels.cpp components/fill_array_kernels.cpp + components/format_conversion_kernels.cpp components/precision_conversion_kernels.cpp components/reduce_array_kernels.cpp distributed/partition_kernels.cpp diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index d6039e339a9..afe6bdcc5a2 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -135,8 +135,9 @@ void fill_in_matrix_data( data_ptr[i].column / bs); }); // build row pointers from row indices - components::build_row_ptrs_from_idxs(exec, row_idx_array, num_rows, - row_ptrs); + components::convert_idxs_to_ptrs(exec, row_idx_array.get_const_data(), + row_idx_array.get_num_elems(), num_rows, + row_ptrs); // fill in values components::fill_array(exec, value_array.get_data(), num_blocks * bs * bs, zero()); diff --git a/common/unified/components/device_matrix_data_kernels.cpp b/common/unified/components/device_matrix_data_kernels.cpp index 6d3a7339a06..efe203fb00b 100644 --- a/common/unified/components/device_matrix_data_kernels.cpp +++ b/common/unified/components/device_matrix_data_kernels.cpp @@ -58,7 +58,7 @@ void build_row_ptrs(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto num_nonzeros, auto num_rows, auto nonzeros, auto row_ptrs) { - auto begin_row = i == 0 ? size_type{} : nonzeros[i - 1].row; + auto begin_row = i == 0 ? IndexType{} : nonzeros[i - 1].row; auto end_row = i == num_nonzeros ? num_rows : nonzeros[i].row; for (auto row = begin_row; row < end_row; row++) { row_ptrs[row + 1] = i; @@ -78,38 +78,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_KERNEL64); -template -void build_row_ptrs_from_idxs(std::shared_ptr exec, - const Array& row_idxs, - size_type num_rows, RowPtrType* row_ptrs) -{ - if (row_idxs.get_num_elems() == 0) { - fill_array(exec, row_ptrs, num_rows + 1, RowPtrType{}); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto i, auto num_idxs, auto num_rows, auto row_idxs, - auto row_ptrs) { - auto begin_row = i == 0 ? size_type{} : row_idxs[i - 1]; - auto end_row = i == num_idxs ? num_rows : row_idxs[i]; - for (auto row = begin_row; row < end_row; row++) { - row_ptrs[row + 1] = i; - } - if (i == 0) { - row_ptrs[0] = 0; - } - }, - row_idxs.get_num_elems() + 1, row_idxs.get_num_elems(), num_rows, - row_idxs, row_ptrs); - } -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL32); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL64); - - } // namespace components } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/components/format_conversion_kernels.cpp b/common/unified/components/format_conversion_kernels.cpp new file mode 100644 index 00000000000..75d5837d34f --- /dev/null +++ b/common/unified/components/format_conversion_kernels.cpp @@ -0,0 +1,119 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/format_conversion_kernels.hpp" + + +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "core/components/fill_array_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace components { + + +template +void convert_ptrs_to_idxs(std::shared_ptr exec, + const RowPtrType* ptrs, size_type num_blocks, + IndexType* idxs) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto block, auto ptrs, auto idxs) { + auto begin = ptrs[block]; + auto end = ptrs[block + 1]; + for (auto i = begin; i < end; i++) { + idxs[i] = block; + } + }, + num_blocks, ptrs, idxs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_IDXS32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_IDXS64); + + +template +void convert_idxs_to_ptrs(std::shared_ptr exec, + const IndexType* idxs, size_type num_idxs, + size_type num_blocks, RowPtrType* ptrs) +{ + if (num_idxs == 0) { + fill_array(exec, ptrs, num_blocks + 1, RowPtrType{}); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto num_idxs, auto num_blocks, auto idxs, + auto ptrs) { + auto begin = i == 0 ? IndexType{} : idxs[i - 1]; + auto end = i == num_idxs ? num_blocks : idxs[i]; + for (auto block = begin; block < end; block++) { + ptrs[block + 1] = i; + } + if (i == 0) { + ptrs[0] = 0; + } + }, + num_idxs + 1, num_idxs, num_blocks, idxs, ptrs); + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS64); + + +template +void convert_ptrs_to_sizes(std::shared_ptr exec, + const RowPtrType* ptrs, size_type num_blocks, + IndexType* sizes) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto block, auto ptrs, auto sizes) { + sizes[block] = ptrs[block + 1] - ptrs[block]; + }, + num_blocks, ptrs, sizes); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES64); + + +} // namespace components +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/components/device_matrix_data_kernels.hpp b/core/components/device_matrix_data_kernels.hpp index 5df4eb1b6d2..419b7cb2fe2 100644 --- a/core/components/device_matrix_data_kernels.hpp +++ b/core/components/device_matrix_data_kernels.hpp @@ -75,21 +75,6 @@ namespace kernels { GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_KERNEL(ValueType, IndexType, \ ::gko::int64) -#define GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL( \ - IndexType, RowPtrType) \ - void build_row_ptrs_from_idxs(std::shared_ptr exec, \ - const Array& row_idxs, \ - size_type num_rows, RowPtrType* row_ptrs) - -#define GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL32( \ - IndexType) \ - GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL( \ - IndexType, ::gko::int32) -#define GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL64( \ - IndexType) \ - GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL( \ - IndexType, ::gko::int64) - #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -99,10 +84,7 @@ namespace kernels { IndexType); \ template \ GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_KERNEL(ValueType, IndexType, \ - RowPtrType); \ - template \ - GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL(IndexType, \ - RowPtrType) + RowPtrType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(components, diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp new file mode 100644 index 00000000000..8cf17dbe5e1 --- /dev/null +++ b/core/components/format_conversion_kernels.hpp @@ -0,0 +1,100 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_COMPONENTS_FORMAT_CONVERSION_KERNELS_HPP_ +#define GKO_CORE_COMPONENTS_FORMAT_CONVERSION_KERNELS_HPP_ + + +#include + + +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, RowPtrType) \ + void convert_ptrs_to_idxs(std::shared_ptr exec, \ + const RowPtrType* ptrs, size_type num_blocks, \ + IndexType* idxs) +#define GKO_DECLARE_CONVERT_PTRS_TO_IDXS32(IndexType) \ + GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, ::gko::int32) +#define GKO_DECLARE_CONVERT_PTRS_TO_IDXS64(IndexType) \ + GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, ::gko::int64) + +#define GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, RowPtrType) \ + void convert_idxs_to_ptrs(std::shared_ptr exec, \ + const IndexType* idxs, size_type num_idxs, \ + size_type num_blocks, RowPtrType* ptrs) +#define GKO_DECLARE_CONVERT_IDXS_TO_PTRS32(IndexType) \ + GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, ::gko::int32) +#define GKO_DECLARE_CONVERT_IDXS_TO_PTRS64(IndexType) \ + GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, ::gko::int64) + +#define GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, RowPtrType) \ + void convert_ptrs_to_sizes(std::shared_ptr exec, \ + const RowPtrType* ptrs, size_type num_blocks, \ + IndexType* sizes) +#define GKO_DECLARE_CONVERT_PTRS_TO_SIZES32(IndexType) \ + GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, ::gko::int32) +#define GKO_DECLARE_CONVERT_PTRS_TO_SIZES64(IndexType) \ + GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, ::gko::int64) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, RowPtrType); \ + template \ + GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, RowPtrType); \ + template \ + GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, RowPtrType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(components, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_COMPONENTS_FORMAT_CONVERSION_KERNELS_HPP_ diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 7c7446e38b7..be571393c66 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/absolute_array_kernels.hpp" #include "core/components/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/precision_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/components/reduce_array_kernels.hpp" @@ -190,14 +191,22 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_KERNEL32); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_KERNEL64); + +template +GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, RowPtrType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_IDXS32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_IDXS64); +template +GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, RowPtrType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS64); template -GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL(IndexType, - RowPtrType) +GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, RowPtrType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL32); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_DEVICE_MATRIX_DATA_BUILD_ROW_PTRS_FROM_IDXS_KERNEL64); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES64); } // namespace components @@ -459,7 +468,6 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); @@ -524,7 +532,6 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp index 218fe5476ab..e54538ad9e3 100644 --- a/core/factorization/par_ic.cpp +++ b/core/factorization/par_ic.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/format_conversion_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/par_ic_kernels.hpp" #include "core/matrix/csr_kernels.hpp" @@ -64,7 +65,7 @@ GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); GKO_REGISTER_OPERATION(init_factor, par_ic_factorization::init_factor); GKO_REGISTER_OPERATION(compute_factor, par_ic_factorization::compute_factor); GKO_REGISTER_OPERATION(csr_transpose, csr::transpose); -GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo); +GKO_REGISTER_OPERATION(convert_ptrs_to_idxs, components::convert_ptrs_to_idxs); } // anonymous namespace @@ -124,14 +125,12 @@ std::unique_ptr> ParIc::generate( auto l_vals_view = Array::view(exec, l_nnz, l_factor->get_values()); auto a_vals = Array{exec, l_vals_view}; - auto a_row_idxs = + auto a_row_idxs = Array{exec, l_nnz}; + auto a_col_idxs = Array::view(exec, l_nnz, l_factor->get_col_idxs()); - auto a_col_idxs = Array{exec, l_nnz}; auto a_lower_coo = CooMatrix::create(exec, matrix_size, std::move(a_vals), - std::move(a_row_idxs), std::move(a_col_idxs)); - exec->run(par_ic_factorization::make_convert_to_coo(l_factor.get(), - a_lower_coo.get())); + std::move(a_col_idxs), std::move(a_row_idxs)); // compute sqrt of diagonal entries exec->run(par_ic_factorization::make_init_factor(l_factor.get())); diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp index 53641b88233..1c3ab05195c 100644 --- a/core/factorization/par_ict.cpp +++ b/core/factorization/par_ict.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/utils.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/par_ict_kernels.hpp" #include "core/factorization/par_ilu_kernels.hpp" @@ -74,7 +75,7 @@ GKO_REGISTER_OPERATION(initialize_row_ptrs_l, GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); GKO_REGISTER_OPERATION(csr_conj_transpose, csr::conj_transpose); -GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo); +GKO_REGISTER_OPERATION(convert_ptrs_to_idxs, components::convert_ptrs_to_idxs); GKO_REGISTER_OPERATION(spgemm, csr::spgemm); @@ -84,7 +85,7 @@ GKO_REGISTER_OPERATION(spgemm, csr::spgemm); using par_ict_factorization::make_add_candidates; using par_ict_factorization::make_compute_factor; -using par_ict_factorization::make_convert_to_coo; +using par_ict_factorization::make_convert_ptrs_to_idxs; using par_ict_factorization::make_csr_conj_transpose; using par_ict_factorization::make_initialize_l; using par_ict_factorization::make_initialize_row_ptrs_l; @@ -242,7 +243,9 @@ void ParIctState::iterate() } // convert L into COO format - exec->run(make_convert_to_coo(l_new.get(), l_coo.get())); + exec->run(make_convert_ptrs_to_idxs(l_new->get_const_row_ptrs(), + l_new->get_size()[0], + l_coo->get_row_idxs())); // execute asynchronous iteration exec->run(make_compute_factor(system_matrix, l_new.get(), l_coo.get())); diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp index a4555226ed2..b571873f2a2 100644 --- a/core/factorization/par_ilut.cpp +++ b/core/factorization/par_ilut.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/utils.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/par_ilu_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" @@ -74,7 +75,7 @@ GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u, GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u); GKO_REGISTER_OPERATION(csr_transpose, csr::transpose); -GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo); +GKO_REGISTER_OPERATION(convert_ptrs_to_idxs, components::convert_ptrs_to_idxs); GKO_REGISTER_OPERATION(spgemm, csr::spgemm); @@ -84,7 +85,7 @@ GKO_REGISTER_OPERATION(spgemm, csr::spgemm); using par_ilut_factorization::make_add_candidates; using par_ilut_factorization::make_compute_l_u_factors; -using par_ilut_factorization::make_convert_to_coo; +using par_ilut_factorization::make_convert_ptrs_to_idxs; using par_ilut_factorization::make_csr_transpose; using par_ilut_factorization::make_initialize_l_u; using par_ilut_factorization::make_initialize_row_ptrs_l_u; @@ -283,8 +284,12 @@ void ParIlutState::iterate() exec->run(make_csr_transpose(u_new.get(), u_new_csc.get())); // convert L' and U' into COO format - exec->run(make_convert_to_coo(l_new.get(), l_coo.get())); - exec->run(make_convert_to_coo(u_new.get(), u_coo.get())); + exec->run(make_convert_ptrs_to_idxs(l_new->get_const_row_ptrs(), + l_new->get_size()[0], + l_coo->get_row_idxs())); + exec->run(make_convert_ptrs_to_idxs(u_new->get_const_row_ptrs(), + u_new->get_size()[0], + u_coo->get_row_idxs())); // execute asynchronous iteration exec->run(make_compute_l_u_factors(system_matrix, l_new.get(), l_coo.get(), diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index dd145dc3b5d..f402c4835ee 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/absolute_array_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/matrix/coo_kernels.hpp" @@ -62,7 +63,7 @@ GKO_REGISTER_OPERATION(advanced_spmv, coo::advanced_spmv); GKO_REGISTER_OPERATION(spmv2, coo::spmv2); GKO_REGISTER_OPERATION(advanced_spmv2, coo::advanced_spmv2); GKO_REGISTER_OPERATION(fill_in_matrix_data, coo::fill_in_matrix_data); -GKO_REGISTER_OPERATION(convert_to_csr, coo::convert_to_csr); +GKO_REGISTER_OPERATION(convert_idxs_to_ptrs, components::convert_idxs_to_ptrs); GKO_REGISTER_OPERATION(convert_to_dense, coo::convert_to_dense); GKO_REGISTER_OPERATION(extract_diagonal, coo::extract_diagonal); GKO_REGISTER_OPERATION(fill_array, components::fill_array); @@ -153,7 +154,9 @@ void Coo::convert_to( result->get_strategy()); tmp->values_ = this->values_; tmp->col_idxs_ = this->col_idxs_; - exec->run(coo::make_convert_to_csr(this, tmp.get())); + exec->run(coo::make_convert_idxs_to_ptrs( + this->get_const_row_idxs(), this->get_num_stored_elements(), + this->get_size()[0], tmp->get_row_ptrs())); tmp->make_srow(); tmp->move_to(result); } @@ -168,7 +171,9 @@ void Coo::move_to(Csr* result) result->get_strategy()); tmp->values_ = std::move(this->values_); tmp->col_idxs_ = std::move(this->col_idxs_); - exec->run(coo::make_convert_to_csr(this, tmp.get())); + exec->run(coo::make_convert_idxs_to_ptrs(this->get_const_row_idxs(), nnz, + this->get_size()[0], + tmp->get_row_ptrs())); tmp->make_srow(); tmp->move_to(result); } diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index 2959c16b19e..58171ce7f1c 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -86,11 +86,6 @@ namespace kernels { const matrix::Coo* source, \ matrix::Dense* result) -#define GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ - void convert_to_csr(std::shared_ptr exec, \ - const matrix::Coo* source, \ - matrix::Csr* result) - #define GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) \ void extract_diagonal(std::shared_ptr exec, \ const matrix::Coo* orig, \ @@ -108,8 +103,6 @@ namespace kernels { template \ GKO_DECLARE_COO_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ - template \ GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 473ecc9ef2c..4048e595279 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/absolute_array_kernels.hpp" #include "core/components/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_kernels.hpp" @@ -67,7 +68,7 @@ GKO_REGISTER_OPERATION(advanced_spgemm, csr::advanced_spgemm); GKO_REGISTER_OPERATION(spgeam, csr::spgeam); GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(fill_in_matrix_data, csr::fill_in_matrix_data); -GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo); +GKO_REGISTER_OPERATION(convert_ptrs_to_idxs, components::convert_ptrs_to_idxs); GKO_REGISTER_OPERATION(convert_to_dense, csr::convert_to_dense); GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp); GKO_REGISTER_OPERATION(calculate_total_cols, csr::calculate_total_cols); @@ -187,7 +188,8 @@ void Csr::convert_to( exec, this->get_size(), this->get_num_stored_elements()); tmp->values_ = this->values_; tmp->col_idxs_ = this->col_idxs_; - exec->run(csr::make_convert_to_coo(this, tmp.get())); + exec->run(csr::make_convert_ptrs_to_idxs( + this->get_const_row_ptrs(), this->get_size()[0], tmp->get_row_idxs())); tmp->move_to(result); } diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index a71b2dde25f..97d9545a579 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -102,11 +102,6 @@ namespace kernels { const matrix::Csr* source, \ matrix::Dense* result) -#define GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType) \ - void convert_to_coo(std::shared_ptr exec, \ - const matrix::Csr* source, \ - matrix::Coo* result) - #define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType) \ void convert_to_ell(std::shared_ptr exec, \ const matrix::Csr* source, \ @@ -232,8 +227,6 @@ namespace kernels { template \ GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType); \ - template \ GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index 458d8928323..f5989b51d99 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -181,38 +181,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_row_idxs_to_ptrs(std::shared_ptr exec, - const IndexType* idxs, size_type num_nonzeros, - IndexType* ptrs, size_type length) -{ - const auto grid_dim = ceildiv(num_nonzeros, default_block_size); - - kernel::convert_row_idxs_to_ptrs<<>>( - as_cuda_type(idxs), num_nonzeros, as_cuda_type(ptrs), length); -} - - -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - const auto nnz = result->get_num_stored_elements(); - - const auto source_row_idxs = source->get_const_row_idxs(); - - convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, - num_rows + 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Coo* source, diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 4b6415c0b65..9c816d08902 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -847,35 +847,6 @@ void spgeam(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); -template -void convert_row_ptrs_to_idxs(std::shared_ptr exec, - const IndexType* ptrs, size_type num_rows, - IndexType* idxs) -{ - const auto grid_dim = ceildiv(num_rows, default_block_size); - - kernel::convert_row_ptrs_to_idxs<<>>( - num_rows, as_cuda_type(ptrs), as_cuda_type(idxs)); -} - - -template -void convert_to_coo(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Coo* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_idxs = result->get_row_idxs(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - - convert_row_ptrs_to_idxs(exec, source_row_ptrs, num_rows, row_idxs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Csr* source, diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index 4df5489b2dc..ada8752a22d 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -56,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/block_sizes.hpp" #include "core/components/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu index 7c22833204f..7514f4a6640 100644 --- a/cuda/matrix/hybrid_kernels.cu +++ b/cuda/matrix/hybrid_kernels.cu @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_kernels.hpp" #include "core/matrix/ell_kernels.hpp" @@ -103,17 +104,15 @@ void convert_to_csr(std::shared_ptr exec, const auto coo_num_stored_elements = source->get_coo_num_stored_elements(); // Compute the row offset of Coo without zeros - size_type grid_num = ceildiv(coo_num_stored_elements, default_block_size); - coo::kernel::convert_row_idxs_to_ptrs<<>>( - as_cuda_type(coo_row), coo_num_stored_elements, - as_cuda_type(coo_offset.get_data()), num_rows + 1); + components::convert_idxs_to_ptrs(exec, coo_row, coo_num_stored_elements, + num_rows, coo_offset.get_data()); // Compute the row ptrs of Csr auto row_ptrs = result->get_row_ptrs(); auto coo_row_ptrs = Array(exec, num_rows); components::fill_array(exec, row_ptrs, num_rows + 1, zero()); - grid_num = ceildiv(num_rows, warps_in_block); + size_type grid_num = ceildiv(num_rows, warps_in_block); ell::kernel::count_nnz_per_row<<>>( num_rows, max_nnz_per_row, stride, as_cuda_type(ell_val), as_cuda_type(row_ptrs)); diff --git a/dpcpp/components/format_conversion.dp.hpp b/dpcpp/components/format_conversion.dp.hpp index cb757d7c091..b0e6cb256c5 100644 --- a/dpcpp/components/format_conversion.dp.hpp +++ b/dpcpp/components/format_conversion.dp.hpp @@ -57,46 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { namespace kernels { namespace dpcpp { -namespace ell { -namespace kernel { - - -/** - * @internal - * - * It counts the number of explicit nonzeros per row of Ell. - */ -template -void count_nnz_per_row(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - size_type max_nnz_per_row, size_type stride, - const ValueType* values, IndexType* result); - - -} // namespace kernel -} // namespace ell - - namespace coo { -namespace kernel { - - -/** - * @internal - * - * It converts the row index of Coo to the row pointer of Csr. - */ -template -void convert_row_idxs_to_ptrs(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* idxs, - size_type num_nonzeros, IndexType* ptrs, - size_type length); - - -} // namespace kernel - - namespace host_kernel { diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp index d981ad237fd..55f80e8109c 100644 --- a/dpcpp/matrix/coo_kernels.dp.cpp +++ b/dpcpp/matrix/coo_kernels.dp.cpp @@ -471,39 +471,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_row_idxs_to_ptrs(std::shared_ptr exec, - const IndexType* idxs, size_type num_nonzeros, - IndexType* ptrs, size_type length) -{ - const auto grid_dim = ceildiv(num_nonzeros, default_block_size); - - kernel::convert_row_idxs_to_ptrs(grid_dim, default_block_size, 0, - exec->get_queue(), idxs, num_nonzeros, - ptrs, length); -} - - -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - const auto nnz = result->get_num_stored_elements(); - - const auto source_row_idxs = source->get_const_row_idxs(); - - convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, - num_rows + 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Coo* source, diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 3d2b8839fe4..5cb9757f27b 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2224,23 +2224,6 @@ void convert_row_ptrs_to_idxs(std::shared_ptr exec, } -template -void convert_to_coo(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Coo* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_idxs = result->get_row_idxs(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - - convert_row_ptrs_to_idxs(exec, source_row_ptrs, num_rows, row_idxs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Csr* source, diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp index 711b70ff2e4..babb8dbc096 100644 --- a/hip/matrix/coo_kernels.hip.cpp +++ b/hip/matrix/coo_kernels.hip.cpp @@ -190,39 +190,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_row_idxs_to_ptrs(std::shared_ptr exec, - const IndexType* idxs, size_type num_nonzeros, - IndexType* ptrs, size_type length) -{ - const auto grid_dim = ceildiv(num_nonzeros, default_block_size); - - hipLaunchKernelGGL(kernel::convert_row_idxs_to_ptrs, dim3(grid_dim), - dim3(default_block_size), 0, 0, as_hip_type(idxs), - num_nonzeros, as_hip_type(ptrs), length); -} - - -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - const auto nnz = result->get_num_stored_elements(); - - const auto source_row_idxs = source->get_const_row_idxs(); - - convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, - num_rows + 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Coo* source, diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index a4d359c702a..f3e3f133d90 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -696,36 +696,6 @@ void spgeam(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); -template -void convert_row_ptrs_to_idxs(std::shared_ptr exec, - const IndexType* ptrs, size_type num_rows, - IndexType* idxs) -{ - const auto grid_dim = ceildiv(num_rows, default_block_size); - - hipLaunchKernelGGL(kernel::convert_row_ptrs_to_idxs, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - as_hip_type(ptrs), as_hip_type(idxs)); -} - - -template -void convert_to_coo(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Coo* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_idxs = result->get_row_idxs(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - - convert_row_ptrs_to_idxs(exec, source_row_ptrs, num_rows, row_idxs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Csr* source, diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 0513b484e65..530f42e8978 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -56,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" #include "core/components/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "hip/base/config.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp index 0207c0b207b..f65084f657c 100644 --- a/hip/matrix/hybrid_kernels.hip.cpp +++ b/hip/matrix/hybrid_kernels.hip.cpp @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_kernels.hpp" #include "core/matrix/ell_kernels.hpp" @@ -104,18 +105,15 @@ void convert_to_csr(std::shared_ptr exec, const auto coo_num_stored_elements = source->get_coo_num_stored_elements(); // Compute the row offset of Coo without zeros - size_type grid_num = ceildiv(coo_num_stored_elements, default_block_size); - hipLaunchKernelGGL(coo::kernel::convert_row_idxs_to_ptrs, dim3(grid_num), - dim3(default_block_size), 0, 0, as_hip_type(coo_row), - coo_num_stored_elements, - as_hip_type(coo_offset.get_data()), num_rows + 1); + components::convert_idxs_to_ptrs(exec, coo_row, coo_num_stored_elements, + num_rows, coo_offset.get_data()); // Compute the row ptrs of Csr auto row_ptrs = result->get_row_ptrs(); auto coo_row_ptrs = Array(exec, num_rows); components::fill_array(exec, row_ptrs, num_rows + 1, zero()); - grid_num = ceildiv(num_rows, warps_in_block); + size_type grid_num = ceildiv(num_rows, warps_in_block); hipLaunchKernelGGL(ell::kernel::count_nnz_per_row, dim3(grid_num), dim3(default_block_size), 0, 0, num_rows, max_nnz_per_row, stride, as_hip_type(ell_val), diff --git a/omp/components/format_conversion.hpp b/omp/components/format_conversion.hpp deleted file mode 100644 index 80ea0814d64..00000000000 --- a/omp/components/format_conversion.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include -#include - - -#include - - -#include - - -#include "core/components/prefix_sum_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace omp { - - -/** - * @internal - * - * Converts an array of indexes `idxs` in any order to an array of pointers - * `ptrs`. This is used for transposing a csr matrix when calculating the row - * pointers of the transposed matrix out of the column indices of the original - * matrix. - */ -template -inline void convert_unsorted_idxs_to_ptrs(const IndexType* idxs, - size_type num_nonzeros, - IndexType* ptrs, size_type length) -{ -#pragma omp parallel for schedule(static, \ - ceildiv(length, omp_get_max_threads())) - for (size_type i = 0; i < length; i++) { - ptrs[i] = 0; - } - - std::for_each(idxs, idxs + num_nonzeros, [&](IndexType v) { - if (v + 1 < length) { - ++ptrs[v + 1]; - } - }); - - std::partial_sum(ptrs, ptrs + length, ptrs); -} - - -/** - * @internal - * - * Converts an array of indexes `idxs` which are already stored in an increasing - * order to an array of pointers `ptrs`. This is used to calculate the row - * pointers when converting a coo matrix to a csr matrix. - */ -template -inline void convert_sorted_idxs_to_ptrs(const IndexType* idxs, - size_type num_nonzeros, IndexType* ptrs, - size_type num_rows) -{ - ptrs[0] = 0; - - if (num_nonzeros == 0) { -#pragma omp parallel for - for (size_type row = 0; row < num_rows; row++) { - ptrs[row + 1] = 0; - } - } else { - // add virtual sentinel values 0 and num_rows to handle empty first and - // last rows -#pragma omp parallel for - for (size_type i = 0; i <= num_nonzeros; i++) { - auto begin_row = i == 0 ? size_type{} : idxs[i - 1]; - auto end_row = i == num_nonzeros ? num_rows : idxs[i]; - for (auto row = begin_row; row < end_row; row++) { - ptrs[row + 1] = i; - } - } - } -} - - -template -inline void convert_ptrs_to_idxs(const IndexType* ptrs, size_type num_rows, - IndexType* idxs) -{ -#pragma omp parallel for - for (size_type row = 0; row < num_rows; ++row) { - for (size_type i = ptrs[row]; i < static_cast(ptrs[row + 1]); - ++i) { - idxs[i] = row; - } - } -} - - -} // namespace omp -} // namespace kernels -} // namespace gko diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp index f01e41a9033..1a9083f3d80 100644 --- a/omp/matrix/coo_kernels.cpp +++ b/omp/matrix/coo_kernels.cpp @@ -47,7 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" #include "omp/components/atomic.hpp" -#include "omp/components/format_conversion.hpp" namespace gko { @@ -356,34 +355,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_row_idxs_to_ptrs(std::shared_ptr exec, - const IndexType* idxs, size_type num_nonzeros, - IndexType* ptrs, size_type num_rows) -{ - convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, num_rows); -} - - -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - const auto nnz = result->get_num_stored_elements(); - - const auto source_row_idxs = source->get_const_row_idxs(); - - convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, num_rows); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Coo* source, diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 2cb2732f87a..22087a20ead 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -53,10 +53,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/allocator.hpp" #include "core/base/iterator_factory.hpp" #include "core/base/utils.hpp" +#include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "omp/components/csr_spgeam.hpp" -#include "omp/components/format_conversion.hpp" namespace gko { @@ -553,32 +553,6 @@ void spgeam(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); -template -void convert_row_ptrs_to_idxs(std::shared_ptr exec, - const IndexType* ptrs, size_type num_rows, - IndexType* idxs) -{ - convert_ptrs_to_idxs(ptrs, num_rows, idxs); -} - - -template -void convert_to_coo(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Coo* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_idxs = result->get_row_idxs(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - - convert_row_ptrs_to_idxs(exec, source_row_ptrs, num_rows, row_idxs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Csr* source, @@ -660,9 +634,12 @@ void transpose_and_transform(std::shared_ptr exec, auto orig_num_rows = orig->get_size()[0]; auto orig_nnz = orig_row_ptrs[orig_num_rows]; - trans_row_ptrs[0] = 0; - convert_unsorted_idxs_to_ptrs(orig_col_idxs, orig_nnz, trans_row_ptrs + 1, - orig_num_cols); + components::fill_array(exec, trans_row_ptrs, orig_num_cols + 1, + IndexType{}); + for (size_type i = 0; i < orig_nnz; i++) { + trans_row_ptrs[orig_col_idxs[i] + 1]++; + } + components::prefix_sum(exec, trans_row_ptrs + 1, orig_num_cols); convert_csr_to_csc(orig_num_rows, orig_row_ptrs, orig_col_idxs, orig_vals, trans_col_idxs, trans_row_ptrs + 1, trans_vals, op); diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index b3fe7cd92ac..913ce97ec3f 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -47,7 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "accessor/reduced_row_major.hpp" #include "core/base/mixed_precision_types.hpp" -#include "omp/components/format_conversion.hpp" namespace gko { diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index 152eaf25c76..0ce83d93a03 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -52,8 +52,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/block_sizes.hpp" #include "core/base/iterator_factory.hpp" #include "core/base/utils.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "omp/components/format_conversion.hpp" namespace gko { @@ -262,6 +263,7 @@ void convert_fbcsr_to_fbcsc(const IndexType num_blk_rows, const int blksz, template void transpose_and_transform( + std::shared_ptr exec, matrix::Fbcsr* const trans, const matrix::Fbcsr* const orig, UnaryOperator op) { @@ -277,9 +279,11 @@ void transpose_and_transform( const IndexType nbrows = orig->get_num_block_rows(); auto orig_nbnz = orig_row_ptrs[nbrows]; - trans_row_ptrs[0] = 0; - convert_unsorted_idxs_to_ptrs(orig_col_idxs, orig_nbnz, trans_row_ptrs + 1, - nbcols); + components::fill_array(exec, trans_row_ptrs, nbcols + 1, IndexType{}); + for (size_type i = 0; i < orig_nbnz; i++) { + trans_row_ptrs[orig_col_idxs[i] + 1]++; + } + components::prefix_sum(exec, trans_row_ptrs + 1, nbcols); convert_fbcsr_to_fbcsc( nbrows, bs, orig_row_ptrs, orig_col_idxs, orig_vals, trans_col_idxs, @@ -292,7 +296,8 @@ void transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) { - transpose_and_transform(trans, orig, [](const ValueType x) { return x; }); + transpose_and_transform(exec, trans, orig, + [](const ValueType x) { return x; }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -304,7 +309,7 @@ void conj_transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) { - transpose_and_transform(trans, orig, + transpose_and_transform(exec, trans, orig, [](const ValueType x) { return conj(x); }); } diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp index 0776dd53485..309fb23a9f5 100644 --- a/omp/matrix/hybrid_kernels.cpp +++ b/omp/matrix/hybrid_kernels.cpp @@ -42,8 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/ell_kernels.hpp" -#include "omp/components/format_conversion.hpp" namespace gko { @@ -151,7 +152,8 @@ void convert_to_csr(std::shared_ptr exec, const auto num_rows = source->get_size()[0]; auto coo_row_ptrs_array = Array(exec, num_rows + 1); auto coo_row_ptrs = coo_row_ptrs_array.get_data(); - convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs, num_rows); + components::convert_idxs_to_ptrs(exec, coo_row, coo_nnz, num_rows, + coo_row_ptrs); // Compute the row sizes of Coo without zeros #pragma omp parallel for diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp index 331f6ab5988..36a4520c1f8 100644 --- a/omp/matrix/sparsity_csr_kernels.cpp +++ b/omp/matrix/sparsity_csr_kernels.cpp @@ -47,7 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/iterator_factory.hpp" -#include "omp/components/format_conversion.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -210,9 +211,12 @@ void transpose_and_transform( auto orig_num_rows = orig->get_size()[0]; auto orig_nnz = orig_row_ptrs[orig_num_rows]; - trans_row_ptrs[0] = 0; - convert_unsorted_idxs_to_ptrs(orig_col_idxs, orig_nnz, trans_row_ptrs + 1, - orig_num_cols); + components::fill_array(exec, trans_row_ptrs, orig_num_cols + 1, + IndexType{}); + for (size_type i = 0; i < orig_nnz; i++) { + trans_row_ptrs[orig_col_idxs[i] + 1]++; + } + components::prefix_sum(exec, trans_row_ptrs + 1, orig_num_cols); convert_sparsity_to_csc(orig_num_rows, orig_row_ptrs, orig_col_idxs, trans_col_idxs, trans_row_ptrs + 1); diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 83f770bc941..ed27f65f48d 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -6,6 +6,7 @@ target_sources(ginkgo_reference components/absolute_array_kernels.cpp components/device_matrix_data_kernels.cpp components/fill_array_kernels.cpp + components/format_conversion_kernels.cpp components/reduce_array_kernels.cpp components/precision_conversion_kernels.cpp components/prefix_sum_kernels.cpp diff --git a/reference/components/format_conversion.hpp b/reference/components/format_conversion.hpp deleted file mode 100644 index ab507d5fb41..00000000000 --- a/reference/components/format_conversion.hpp +++ /dev/null @@ -1,76 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include -#include - - -#include - - -namespace gko { -namespace kernels { -namespace reference { - - -template -inline void convert_idxs_to_ptrs(const IndexType* idxs, size_type num_nonzeros, - IndexType* ptrs, size_type length) -{ - std::fill(ptrs, ptrs + length, 0); - std::for_each(idxs, idxs + num_nonzeros, [&](size_type v) { - if (v + 1 < length) { - ++ptrs[v + 1]; - } - }); - std::partial_sum(ptrs, ptrs + length, ptrs); -} - - -template -inline void convert_ptrs_to_idxs(const IndexType* ptrs, size_type num_rows, - IndexType* idxs) -{ - size_type ind = 0; - - for (size_type row = 0; row < num_rows; ++row) { - for (size_type i = ptrs[row]; i < static_cast(ptrs[row + 1]); - ++i) { - idxs[ind++] = row; - } - } -} - - -} // namespace reference -} // namespace kernels -} // namespace gko diff --git a/reference/components/format_conversion_kernels.cpp b/reference/components/format_conversion_kernels.cpp new file mode 100644 index 00000000000..47ff5a2e9ff --- /dev/null +++ b/reference/components/format_conversion_kernels.cpp @@ -0,0 +1,100 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/format_conversion_kernels.hpp" + + +#include + + +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace components { + + +template +void convert_ptrs_to_idxs(std::shared_ptr exec, + const RowPtrType* ptrs, size_type num_blocks, + IndexType* idxs) +{ + for (size_type block = 0; block < num_blocks; block++) { + auto begin = ptrs[block]; + auto end = ptrs[block + 1]; + for (auto i = begin; i < end; i++) { + idxs[i] = block; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_IDXS32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_IDXS64); + + +template +void convert_idxs_to_ptrs(std::shared_ptr exec, + const IndexType* idxs, size_type num_idxs, + size_type num_blocks, RowPtrType* ptrs) +{ + fill_array(exec, ptrs, num_blocks + 1, RowPtrType{}); + for (size_type i = 0; i < num_idxs; i++) { + ptrs[idxs[i]]++; + } + prefix_sum(exec, ptrs, num_blocks + 1); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS64); + + +template +void convert_ptrs_to_sizes(std::shared_ptr exec, + const RowPtrType* ptrs, size_type num_blocks, + IndexType* sizes) +{ + for (size_type block = 0; block < num_blocks; block++) { + sizes[block] = ptrs[block + 1] - ptrs[block]; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES32); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES64); + + +} // namespace components +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp index 3f0d9200a27..a67b7b2beec 100644 --- a/reference/matrix/coo_kernels.cpp +++ b/reference/matrix/coo_kernels.cpp @@ -39,8 +39,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/format_conversion_kernels.hpp" #include "core/matrix/dense_kernels.hpp" -#include "reference/components/format_conversion.hpp" namespace gko { @@ -148,35 +148,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_FILL_IN_MATRIX_DATA_KERNEL); -template -void convert_row_idxs_to_ptrs(std::shared_ptr exec, - const IndexType* idxs, size_type num_nonzeros, - IndexType* ptrs, size_type length) -{ - convert_idxs_to_ptrs(idxs, num_nonzeros, ptrs, length); -} - - -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - const auto nnz = result->get_num_stored_elements(); - - const auto source_row_idxs = source->get_const_row_idxs(); - - convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, - num_rows + 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL); - - template void convert_to_dense(std::shared_ptr exec, const matrix::Coo* source, diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 51d0829a475..2e7272afbbd 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -51,10 +51,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/allocator.hpp" #include "core/base/iterator_factory.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "reference/components/csr_spgeam.hpp" -#include "reference/components/format_conversion.hpp" namespace gko { @@ -373,31 +374,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_MATRIX_DATA_KERNEL); -template -void convert_row_ptrs_to_idxs(std::shared_ptr exec, - const IndexType* ptrs, size_type num_rows, - IndexType* idxs) -{ - convert_ptrs_to_idxs(ptrs, num_rows, idxs); -} - - -template -void convert_to_coo(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Coo* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_idxs = result->get_row_idxs(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - - convert_row_ptrs_to_idxs(exec, source_row_ptrs, num_rows, row_idxs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL); - template void convert_to_dense(std::shared_ptr exec, const matrix::Csr* source, @@ -596,9 +572,12 @@ void transpose_and_transform(std::shared_ptr exec, auto orig_num_rows = orig->get_size()[0]; auto orig_nnz = orig_row_ptrs[orig_num_rows]; - trans_row_ptrs[0] = 0; - convert_idxs_to_ptrs(orig_col_idxs, orig_nnz, trans_row_ptrs + 1, - orig_num_cols); + components::fill_array(exec, trans_row_ptrs, orig_num_cols + 1, + IndexType{}); + for (size_type i = 0; i < orig_nnz; i++) { + trans_row_ptrs[orig_col_idxs[i] + 1]++; + } + components::prefix_sum(exec, trans_row_ptrs + 1, orig_num_cols); convert_csr_to_csc(orig_num_rows, orig_row_ptrs, orig_col_idxs, orig_vals, trans_col_idxs, trans_row_ptrs + 1, trans_vals, op); diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 21659afd81c..f181cfbaefc 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -50,10 +50,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/block_sizes.hpp" #include "core/base/iterator_factory.hpp" #include "core/base/utils.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/fbcsr_builder.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "reference/components/format_conversion.hpp" namespace gko { @@ -350,6 +351,7 @@ void convert_fbcsr_to_fbcsc(const IndexType num_blk_rows, const int blksz, template void transpose_and_transform( + std::shared_ptr exec, matrix::Fbcsr* const trans, const matrix::Fbcsr* const orig, UnaryOperator op) { @@ -365,8 +367,11 @@ void transpose_and_transform( const IndexType nbrows = orig->get_num_block_rows(); auto orig_nbnz = orig_row_ptrs[nbrows]; - trans_row_ptrs[0] = 0; - convert_idxs_to_ptrs(orig_col_idxs, orig_nbnz, trans_row_ptrs + 1, nbcols); + components::fill_array(exec, trans_row_ptrs, nbcols + 1, IndexType{}); + for (size_type i = 0; i < orig_nbnz; i++) { + trans_row_ptrs[orig_col_idxs[i] + 1]++; + } + components::prefix_sum(exec, trans_row_ptrs + 1, nbcols); convert_fbcsr_to_fbcsc( nbrows, bs, orig_row_ptrs, orig_col_idxs, orig_vals, trans_col_idxs, @@ -375,11 +380,12 @@ void transpose_and_transform( template -void transpose(std::shared_ptr, +void transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) { - transpose_and_transform(trans, orig, [](const ValueType x) { return x; }); + transpose_and_transform(exec, trans, orig, + [](const ValueType x) { return x; }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -387,11 +393,11 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void conj_transpose(std::shared_ptr, +void conj_transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) { - transpose_and_transform(trans, orig, + transpose_and_transform(exec, trans, orig, [](const ValueType x) { return conj(x); }); } diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index 571e17ad5a4..6af99e4e3eb 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -41,8 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/format_conversion_kernels.hpp" #include "core/matrix/ell_kernels.hpp" -#include "reference/components/format_conversion.hpp" namespace gko { diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp index a57179d6969..afe7a2cdc79 100644 --- a/reference/matrix/sparsity_csr_kernels.cpp +++ b/reference/matrix/sparsity_csr_kernels.cpp @@ -44,7 +44,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/iterator_factory.hpp" -#include "reference/components/format_conversion.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -220,9 +221,12 @@ void transpose_and_transform( auto orig_num_rows = orig->get_size()[0]; auto orig_nnz = orig_row_ptrs[orig_num_rows]; - trans_row_ptrs[0] = 0; - convert_idxs_to_ptrs(orig_col_idxs, orig_nnz, trans_row_ptrs + 1, - orig_num_cols); + components::fill_array(exec, trans_row_ptrs, orig_num_cols + 1, + IndexType{}); + for (size_type i = 0; i < orig_nnz; i++) { + trans_row_ptrs[orig_col_idxs[i] + 1]++; + } + components::prefix_sum(exec, trans_row_ptrs + 1, orig_num_cols); convert_sparsity_to_csc(orig_num_rows, orig_row_ptrs, orig_col_idxs, trans_col_idxs, trans_row_ptrs + 1); From 8a89975c88d6f800837368b9b294deb33a3353c2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 30 Nov 2021 18:08:00 +0100 Subject: [PATCH 10/32] unify convert_to(Dense) implementations --- common/unified/matrix/coo_kernels.cpp | 20 ++++ common/unified/matrix/ell_kernels.cpp | 27 ++++++ common/unified/matrix/sellp_kernels.cpp | 29 ++++++ core/device_hooks/common_kernels.inc.cpp | 11 +-- core/matrix/coo.cpp | 9 +- core/matrix/coo_kernels.hpp | 10 +- core/matrix/csr.cpp | 5 +- core/matrix/csr_kernels.hpp | 10 +- core/matrix/dense.cpp | 23 +++-- core/matrix/ell.cpp | 9 +- core/matrix/ell_kernels.hpp | 10 +- core/matrix/fbcsr.cpp | 9 +- core/matrix/fbcsr_kernels.hpp | 10 +- core/matrix/hybrid.cpp | 13 ++- core/matrix/hybrid_kernels.hpp | 7 -- core/matrix/sellp.cpp | 9 +- core/matrix/sellp_kernels.hpp | 10 +- cuda/matrix/coo_kernels.cu | 30 ------ cuda/matrix/csr_kernels.cu | 15 +-- cuda/matrix/ell_kernels.cu | 30 ------ cuda/matrix/fbcsr_kernels.cu | 14 +-- cuda/matrix/hybrid_kernels.cu | 9 -- cuda/matrix/sellp_kernels.cu | 44 --------- dpcpp/matrix/coo_kernels.dp.cpp | 116 ----------------------- dpcpp/matrix/csr_kernels.dp.cpp | 8 +- dpcpp/matrix/ell_kernels.dp.cpp | 70 -------------- dpcpp/matrix/fbcsr_kernels.dp.cpp | 8 +- dpcpp/matrix/hybrid_kernels.dp.cpp | 9 -- dpcpp/matrix/sellp_kernels.dp.cpp | 108 --------------------- hip/matrix/coo_kernels.hip.cpp | 32 ------- hip/matrix/csr_kernels.hip.cpp | 16 +--- hip/matrix/ell_kernels.hip.cpp | 32 ------- hip/matrix/fbcsr_kernels.hip.cpp | 8 +- hip/matrix/hybrid_kernels.hip.cpp | 9 -- hip/matrix/sellp_kernels.hip.cpp | 45 --------- include/ginkgo/core/matrix/dense.hpp | 13 +++ omp/matrix/coo_kernels.cpp | 26 ----- omp/matrix/csr_kernels.cpp | 8 +- omp/matrix/ell_kernels.cpp | 25 ----- omp/matrix/fbcsr_kernels.cpp | 9 +- omp/matrix/hybrid_kernels.cpp | 40 -------- omp/matrix/sellp_kernels.cpp | 36 ------- reference/matrix/coo_kernels.cpp | 15 +-- reference/matrix/csr_kernels.cpp | 11 +-- reference/matrix/ell_kernels.cpp | 11 +-- reference/matrix/fbcsr_kernels.cpp | 16 +--- reference/matrix/hybrid_kernels.cpp | 35 ------- reference/matrix/sellp_kernels.cpp | 11 +-- 48 files changed, 216 insertions(+), 864 deletions(-) diff --git a/common/unified/matrix/coo_kernels.cpp b/common/unified/matrix/coo_kernels.cpp index 530c8544e8f..3e6c66bb4e7 100644 --- a/common/unified/matrix/coo_kernels.cpp +++ b/common/unified/matrix/coo_kernels.cpp @@ -95,6 +95,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::Coo* orig, + matrix::Dense* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto tidx, auto orig_values, auto orig_row_idxs, + auto orig_col_idxs, auto result) { + result(orig_row_idxs[tidx], orig_col_idxs[tidx]) = + orig_values[tidx]; + }, + orig->get_num_stored_elements(), orig->get_const_values(), + orig->get_const_row_idxs(), orig->get_const_col_idxs(), result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL); + + } // namespace coo } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp index 196b113b0c9..d93e91ea18b 100644 --- a/common/unified/matrix/ell_kernels.cpp +++ b/common/unified/matrix/ell_kernels.cpp @@ -99,6 +99,33 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::Ell* source, + matrix::Dense* result) +{ + // TODO simplify once we can guarantee unique column indices outside padding + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto cols, auto ell_cols, auto ell_stride, + auto in_cols, auto in_vals, auto out) { + for (int64 ell_col = 0; ell_col < ell_cols; ell_col++) { + const auto ell_idx = ell_col * ell_stride + row; + const auto col = in_cols[ell_idx]; + const auto val = in_vals[ell_idx]; + out(row, col) += val; + } + }, + source->get_size()[0], source->get_size()[1], + source->get_num_stored_elements_per_row(), + static_cast(source->get_stride()), source->get_const_col_idxs(), + source->get_const_values(), result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); + + } // namespace ell } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp index 517a4eed6e3..9a8fd2f687f 100644 --- a/common/unified/matrix/sellp_kernels.cpp +++ b/common/unified/matrix/sellp_kernels.cpp @@ -115,6 +115,35 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::Sellp* source, + matrix::Dense* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto slice_size, auto slice_sets, auto cols, + auto values, auto result) { + const auto slice = row / slice_size; + const auto local_row = row % slice_size; + const auto slice_begin = slice_sets[slice]; + const auto slice_end = slice_sets[slice + 1]; + const auto slice_length = slice_end - slice_begin; + auto in_idx = slice_begin * slice_size + local_row; + for (int64 i = 0; i < slice_length; i++) { + result(row, cols[in_idx]) += values[in_idx]; + in_idx += slice_size; + } + }, + source->get_size()[0], source->get_slice_size(), + source->get_const_slice_sets(), source->get_const_col_idxs(), + source->get_const_values(), result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); + + } // namespace sellp } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index be571393c66..68d6e950983 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -467,7 +467,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); @@ -508,7 +508,7 @@ namespace fbcsr { GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -532,7 +532,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL); @@ -546,7 +546,7 @@ GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELL_COMPUTE_MAX_ROW_NNZ_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE( @@ -584,7 +584,6 @@ namespace hybrid { GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); @@ -599,7 +598,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB(GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index f402c4835ee..c6041109610 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -64,7 +64,7 @@ GKO_REGISTER_OPERATION(spmv2, coo::spmv2); GKO_REGISTER_OPERATION(advanced_spmv2, coo::advanced_spmv2); GKO_REGISTER_OPERATION(fill_in_matrix_data, coo::fill_in_matrix_data); GKO_REGISTER_OPERATION(convert_idxs_to_ptrs, components::convert_idxs_to_ptrs); -GKO_REGISTER_OPERATION(convert_to_dense, coo::convert_to_dense); +GKO_REGISTER_OPERATION(fill_in_dense, coo::fill_in_dense); GKO_REGISTER_OPERATION(extract_diagonal, coo::extract_diagonal); GKO_REGISTER_OPERATION(fill_array, components::fill_array); GKO_REGISTER_OPERATION(inplace_absolute_array, @@ -183,9 +183,10 @@ template void Coo::convert_to(Dense* result) const { auto exec = this->get_executor(); - auto tmp = Dense::create(exec, this->get_size()); - exec->run(coo::make_convert_to_dense(this, tmp.get())); - tmp->move_to(result); + result->resize(this->get_size()); + result->fill(zero()); + exec->run(coo::make_fill_in_dense( + this, make_temporary_clone(exec, result).get())); } diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index 58171ce7f1c..552e073a643 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -81,10 +81,10 @@ namespace kernels { const Array>& data, \ matrix::Coo* output) -#define GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - const matrix::Coo* source, \ - matrix::Dense* result) +#define GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ + void fill_in_dense(std::shared_ptr exec, \ + const matrix::Coo* source, \ + matrix::Dense* result) #define GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) \ void extract_diagonal(std::shared_ptr exec, \ @@ -103,7 +103,7 @@ namespace kernels { template \ GKO_DECLARE_COO_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ + GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_COO_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 4048e595279..dba59b3e58d 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -69,7 +69,7 @@ GKO_REGISTER_OPERATION(spgeam, csr::spgeam); GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(fill_in_matrix_data, csr::fill_in_matrix_data); GKO_REGISTER_OPERATION(convert_ptrs_to_idxs, components::convert_ptrs_to_idxs); -GKO_REGISTER_OPERATION(convert_to_dense, csr::convert_to_dense); +GKO_REGISTER_OPERATION(fill_in_dense, csr::fill_in_dense); GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp); GKO_REGISTER_OPERATION(calculate_total_cols, csr::calculate_total_cols); GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell); @@ -206,7 +206,8 @@ void Csr::convert_to(Dense* result) const { auto exec = this->get_executor(); auto tmp = Dense::create(exec, this->get_size()); - exec->run(csr::make_convert_to_dense(this, tmp.get())); + tmp->fill(zero()); + exec->run(csr::make_fill_in_dense(this, tmp.get())); tmp->move_to(result); } diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 97d9545a579..85334922a6b 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -97,10 +97,10 @@ namespace kernels { const Array>& data, \ matrix::Csr* output) -#define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - const matrix::Csr* source, \ - matrix::Dense* result) +#define GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ + void fill_in_dense(std::shared_ptr exec, \ + const matrix::Csr* source, \ + matrix::Dense* result) #define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType) \ void convert_to_ell(std::shared_ptr exec, \ @@ -225,7 +225,7 @@ namespace kernels { template \ GKO_DECLARE_CSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index ee38b91f43e..be09de48dd3 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -672,14 +672,21 @@ void Dense::move_to(SparsityCsr* result) template -void Dense::read(const device_mat_data& data) +void Dense::resize(gko::dim<2> new_size) { - if (this->get_size() != data.size) { - this->set_size(data.size); - this->stride_ = data.size[1]; - this->values_.resize_and_reset(data.size[0] * this->get_stride()); + if (this->get_size() != new_size) { + this->set_size(new_size); + this->stride_ = new_size[1]; + this->values_.resize_and_reset(new_size[0] * this->get_stride()); } +} + + +template +void Dense::read(const device_mat_data& data) +{ auto exec = this->get_executor(); + this->resize(data.size); this->fill(zero()); exec->run(dense::make_fill_in_matrix_data( *make_temporary_clone(exec, &data.nonzeros), this)); @@ -689,12 +696,8 @@ void Dense::read(const device_mat_data& data) template void Dense::read(const device_mat_data32& data) { - if (this->get_size() != data.size) { - this->set_size(data.size); - this->stride_ = data.size[1]; - this->values_.resize_and_reset(data.size[0] * this->get_stride()); - } auto exec = this->get_executor(); + this->resize(data.size); this->fill(zero()); exec->run(dense::make_fill_in_matrix_data( *make_temporary_clone(exec, &data.nonzeros), this)); diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 782e51523c5..e5c1ef0ddb6 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -63,7 +63,7 @@ GKO_REGISTER_OPERATION(advanced_spmv, ell::advanced_spmv); GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(compute_max_row_nnz, ell::compute_max_row_nnz); GKO_REGISTER_OPERATION(fill_in_matrix_data, ell::fill_in_matrix_data); -GKO_REGISTER_OPERATION(convert_to_dense, ell::convert_to_dense); +GKO_REGISTER_OPERATION(fill_in_dense, ell::fill_in_dense); GKO_REGISTER_OPERATION(convert_to_csr, ell::convert_to_csr); GKO_REGISTER_OPERATION(count_nonzeros, ell::count_nonzeros); GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, @@ -157,9 +157,10 @@ template void Ell::convert_to(Dense* result) const { auto exec = this->get_executor(); - auto tmp = Dense::create(exec, this->get_size()); - exec->run(ell::make_convert_to_dense(this, tmp.get())); - tmp->move_to(result); + result->resize(this->get_size()); + result->fill(zero()); + exec->run(ell::make_fill_in_dense( + this, make_temporary_clone(exec, result).get())); } diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp index fe179e6dd52..e7c3eaa232c 100644 --- a/core/matrix/ell_kernels.hpp +++ b/core/matrix/ell_kernels.hpp @@ -76,10 +76,10 @@ namespace kernels { const Array>& data, \ const int64* row_ptrs, matrix::Ell* output) -#define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - const matrix::Ell* source, \ - matrix::Dense* result) +#define GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ + void fill_in_dense(std::shared_ptr exec, \ + const matrix::Ell* source, \ + matrix::Dense* result) #define GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ void convert_to_csr(std::shared_ptr exec, \ @@ -117,7 +117,7 @@ namespace kernels { template \ GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ + GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index a80b0cfaceb..867f13654d9 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -68,7 +68,7 @@ GKO_REGISTER_OPERATION(spmv, fbcsr::spmv); GKO_REGISTER_OPERATION(advanced_spmv, fbcsr::advanced_spmv); GKO_REGISTER_OPERATION(fill_in_matrix_data, fbcsr::fill_in_matrix_data); GKO_REGISTER_OPERATION(convert_to_csr, fbcsr::convert_to_csr); -GKO_REGISTER_OPERATION(convert_to_dense, fbcsr::convert_to_dense); +GKO_REGISTER_OPERATION(fill_in_dense, fbcsr::fill_in_dense); GKO_REGISTER_OPERATION(transpose, fbcsr::transpose); GKO_REGISTER_OPERATION(conj_transpose, fbcsr::conj_transpose); GKO_REGISTER_OPERATION(calculate_max_nnz_per_row, @@ -159,9 +159,10 @@ void Fbcsr::convert_to( Dense* const result) const { auto exec = this->get_executor(); - auto tmp = Dense::create(exec, this->get_size()); - exec->run(fbcsr::make_convert_to_dense(this, tmp.get())); - tmp->move_to(result); + result->resize(this->get_size()); + result->fill(zero()); + exec->run(fbcsr::make_fill_in_dense( + this, make_temporary_clone(exec, result).get())); } diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp index 8cc74a7f2a2..fc96fe17d86 100644 --- a/core/matrix/fbcsr_kernels.hpp +++ b/core/matrix/fbcsr_kernels.hpp @@ -72,10 +72,10 @@ namespace kernels { int block_size, Array& row_ptrs, \ Array& col_idxs, Array& values) -#define GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - const matrix::Fbcsr* source, \ - matrix::Dense* result) +#define GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ + void fill_in_dense(std::shared_ptr exec, \ + const matrix::Fbcsr* source, \ + matrix::Dense* result) #define GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ void convert_to_csr(std::shared_ptr exec, \ @@ -127,7 +127,7 @@ namespace kernels { template \ GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 67b0bcfe085..890cdec2251 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -62,7 +62,8 @@ namespace { GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(compute_row_nnz, hybrid::compute_row_nnz); GKO_REGISTER_OPERATION(split_matrix_data, hybrid::split_matrix_data); -GKO_REGISTER_OPERATION(convert_to_dense, hybrid::convert_to_dense); +GKO_REGISTER_OPERATION(ell_fill_in_dense, ell::fill_in_dense); +GKO_REGISTER_OPERATION(coo_fill_in_dense, coo::fill_in_dense); GKO_REGISTER_OPERATION(convert_to_csr, hybrid::convert_to_csr); GKO_REGISTER_OPERATION(count_nonzeros, hybrid::count_nonzeros); GKO_REGISTER_OPERATION(extract_coo_diagonal, coo::extract_diagonal); @@ -133,9 +134,13 @@ template void Hybrid::convert_to(Dense* result) const { auto exec = this->get_executor(); - auto tmp = Dense::create(exec, this->get_size()); - exec->run(hybrid::make_convert_to_dense(this, tmp.get())); - tmp->move_to(result); + result->resize(this->get_size()); + result->fill(zero()); + auto result_local = make_temporary_clone(exec, result); + exec->run( + hybrid::make_ell_fill_in_dense(this->get_ell(), result_local.get())); + exec->run( + hybrid::make_coo_fill_in_dense(this->get_coo(), result_local.get())); } diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp index e0e4656aed3..fddab172cd8 100644 --- a/core/matrix/hybrid_kernels.hpp +++ b/core/matrix/hybrid_kernels.hpp @@ -59,11 +59,6 @@ namespace kernels { Array>& ell_data, \ Array>& coo_data) -#define GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - const matrix::Hybrid* source, \ - matrix::Dense* result) - #define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ void convert_to_csr(std::shared_ptr exec, \ const matrix::Hybrid* source, \ @@ -79,8 +74,6 @@ namespace kernels { template \ GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ - template \ GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL(ValueType, IndexType) diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 0dfb8dc0ff5..ad83738c25a 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -60,7 +60,7 @@ GKO_REGISTER_OPERATION(advanced_spmv, sellp::advanced_spmv); GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(compute_slice_sets, sellp::compute_slice_sets); GKO_REGISTER_OPERATION(fill_in_matrix_data, sellp::fill_in_matrix_data); -GKO_REGISTER_OPERATION(convert_to_dense, sellp::convert_to_dense); +GKO_REGISTER_OPERATION(fill_in_dense, sellp::fill_in_dense); GKO_REGISTER_OPERATION(convert_to_csr, sellp::convert_to_csr); GKO_REGISTER_OPERATION(count_nonzeros, sellp::count_nonzeros); GKO_REGISTER_OPERATION(extract_diagonal, sellp::extract_diagonal); @@ -125,9 +125,10 @@ template void Sellp::convert_to(Dense* result) const { auto exec = this->get_executor(); - auto tmp = Dense::create(exec, this->get_size()); - exec->run(sellp::make_convert_to_dense(this, tmp.get())); - tmp->move_to(result); + result->resize(this->get_size()); + result->fill(zero()); + exec->run(sellp::make_fill_in_dense( + this, make_temporary_clone(exec, result).get())); } diff --git a/core/matrix/sellp_kernels.hpp b/core/matrix/sellp_kernels.hpp index e02b145b84d..39d8a5007b5 100644 --- a/core/matrix/sellp_kernels.hpp +++ b/core/matrix/sellp_kernels.hpp @@ -74,10 +74,10 @@ namespace kernels { size_type slice_size, size_type stride_factor, \ size_type* slice_sets, size_type* slice_lengths) -#define GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ - void convert_to_dense(std::shared_ptr exec, \ - const matrix::Sellp* source, \ - matrix::Dense* result) +#define GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ + void fill_in_dense(std::shared_ptr exec, \ + const matrix::Sellp* source, \ + matrix::Dense* result) #define GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ void convert_to_csr(std::shared_ptr exec, \ @@ -103,7 +103,7 @@ namespace kernels { GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS; \ template \ - GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ + GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ template \ diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index f5989b51d99..0c7540ca7b1 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -181,36 +181,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto stride = result->get_stride(); - - const auto nnz = source->get_num_stored_elements(); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - kernel::initialize_zero_dense<<>>( - num_rows, num_cols, stride, as_cuda_type(result->get_values())); - - const auto grid_dim = ceildiv(nnz, default_block_size); - kernel::fill_in_dense<<>>( - nnz, as_cuda_type(source->get_const_row_idxs()), - as_cuda_type(source->get_const_col_idxs()), - as_cuda_type(source->get_const_values()), stride, - as_cuda_type(result->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); - - } // namespace coo } // namespace cuda } // namespace kernels diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 9c816d08902..ab72c44a018 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -848,9 +848,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); template -void convert_to_dense(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Csr* source, + matrix::Dense* result) { const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -859,13 +859,6 @@ void convert_to_dense(std::shared_ptr exec, const auto col_idxs = source->get_const_col_idxs(); const auto vals = source->get_const_values(); - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - kernel::initialize_zero_dense<<>>( - num_rows, num_cols, stride, as_cuda_type(result->get_values())); - auto grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_dense<<>>( num_rows, as_cuda_type(row_ptrs), as_cuda_type(col_idxs), @@ -873,7 +866,7 @@ void convert_to_dense(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); template diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index dccdcb48074..d935a56c905 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -287,36 +287,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto result_stride = result->get_stride(); - const auto col_idxs = source->get_const_col_idxs(); - const auto vals = source->get_const_values(); - const auto source_stride = source->get_stride(); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - kernel::initialize_zero_dense<<>>( - num_rows, num_cols, result_stride, as_cuda_type(result->get_values())); - - const auto grid_dim = ceildiv(num_rows, default_block_size); - kernel::fill_in_dense<<>>( - num_rows, source->get_num_stored_elements_per_row(), source_stride, - as_cuda_type(col_idxs), as_cuda_type(vals), result_stride, - as_cuda_type(result->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Ell* source, diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index ada8752a22d..eebec3cc574 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -227,19 +227,13 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); -template -void convert_row_ptrs_to_idxs(std::shared_ptr exec, - const IndexType* ptrs, size_type num_rows, - IndexType* idxs) GKO_NOT_IMPLEMENTED; - - template -void convert_to_dense(std::shared_ptr exec, - const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; +void fill_in_dense(std::shared_ptr exec, + const matrix::Fbcsr* source, + matrix::Dense* result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); template diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu index 7514f4a6640..9a15798bf6a 100644 --- a/cuda/matrix/hybrid_kernels.cu +++ b/cuda/matrix/hybrid_kernels.cu @@ -78,15 +78,6 @@ constexpr int warps_in_block = 4; #include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index b33585e6ffc..a37f673c549 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -109,50 +109,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Dense* result) -{ - const auto num_rows = source->get_size()[0]; - const auto num_cols = source->get_size()[1]; - const auto vals = source->get_const_values(); - const auto col_idxs = source->get_const_col_idxs(); - const auto slice_lengths = source->get_const_slice_lengths(); - const auto slice_sets = source->get_const_slice_sets(); - const auto slice_size = source->get_slice_size(); - - const auto slice_num = ceildiv(num_rows, slice_size); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - - if (num_rows > 0 && result->get_stride() > 0) { - kernel::initialize_zero_dense<<>>( - num_rows, num_cols, result->get_stride(), - as_cuda_type(result->get_values())); - } - - constexpr auto threads_per_row = config::warp_size; - const auto grid_dim = - ceildiv(slice_size * slice_num * threads_per_row, default_block_size); - - if (grid_dim > 0) { - kernel::fill_in_dense - <<>>( - num_rows, num_cols, result->get_stride(), slice_size, - as_cuda_type(slice_lengths), as_cuda_type(slice_sets), - as_cuda_type(col_idxs), as_cuda_type(vals), - as_cuda_type(result->get_values())); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Sellp* source, diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp index 55f80e8109c..d06a432c920 100644 --- a/dpcpp/matrix/coo_kernels.dp.cpp +++ b/dpcpp/matrix/coo_kernels.dp.cpp @@ -281,92 +281,6 @@ GKO_ENABLE_DEFAULT_HOST(abstract_spmm, abstract_spmm); } // namespace -namespace kernel { - - -template -void convert_row_idxs_to_ptrs(const IndexType* __restrict__ idxs, - size_type num_nonzeros, - IndexType* __restrict__ ptrs, size_type length, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx == 0) { - ptrs[0] = 0; - ptrs[length - 1] = num_nonzeros; - } - - if (0 < tidx && tidx < num_nonzeros) { - if (idxs[tidx - 1] < idxs[tidx]) { - for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) { - ptrs[i] = tidx; - } - } - } -} - -// can not use GKO_ENABLE_DEFAULT_HOST, otherwise we can not inistantiate it. -template -void convert_row_idxs_to_ptrs(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* idxs, - size_type num_nonzeros, IndexType* ptrs, - size_type length) -{ - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - convert_row_idxs_to_ptrs(idxs, num_nonzeros, ptrs, - length, item_ct1); - }); - }); -} - -template void convert_row_idxs_to_ptrs(dim3, dim3, size_type, sycl::queue*, - const int32* idxs, size_type, int32*, - size_type); -template void convert_row_idxs_to_ptrs(dim3, dim3, size_type, sycl::queue*, - const int64* idxs, size_type, int64*, - size_type); - -template -void initialize_zero_dense(size_type num_rows, size_type num_cols, - size_type stride, ValueType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - const auto tidx_x = - item_ct1.get_local_id(2) + - item_ct1.get_local_range().get(2) * item_ct1.get_group(2); - const auto tidx_y = - item_ct1.get_local_id(1) + - item_ct1.get_local_range().get(1) * item_ct1.get_group(1); - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - -GKO_ENABLE_DEFAULT_HOST(initialize_zero_dense, initialize_zero_dense); - - -template -void fill_in_dense(size_type nnz, const IndexType* __restrict__ row_idxs, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, size_type stride, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < nnz) { - result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx]; - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_dense, fill_in_dense); - - -} // namespace kernel - - template void spmv(std::shared_ptr exec, const matrix::Coo* a, @@ -471,36 +385,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto stride = result->get_stride(); - - const auto nnz = source->get_num_stored_elements(); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - kernel::initialize_zero_dense(init_grid_dim, block_size, 0, - exec->get_queue(), num_rows, num_cols, stride, - result->get_values()); - - const auto grid_dim = ceildiv(nnz, default_block_size); - kernel::fill_in_dense( - grid_dim, default_block_size, 0, exec->get_queue(), nnz, - source->get_const_row_idxs(), source->get_const_col_idxs(), - source->get_const_values(), stride, result->get_values()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); - - } // namespace coo } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 5cb9757f27b..282f55d1e70 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2225,9 +2225,9 @@ void convert_row_ptrs_to_idxs(std::shared_ptr exec, template -void convert_to_dense(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Csr* source, + matrix::Dense* result) { const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -2251,7 +2251,7 @@ void convert_to_dense(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); template diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 31412fb2fdb..b22873e6c74 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -300,45 +300,6 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, } // namespace -template -void initialize_zero_dense(size_type num_rows, size_type num_cols, - size_type stride, ValueType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - const auto tidx_x = - item_ct1.get_local_id(2) + - item_ct1.get_local_range().get(2) * item_ct1.get_group(2); - const auto tidx_y = - item_ct1.get_local_id(1) + - item_ct1.get_local_range().get(1) * item_ct1.get_group(1); - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - -GKO_ENABLE_DEFAULT_HOST(initialize_zero_dense, initialize_zero_dense); - - -template -void fill_in_dense(size_type num_rows, size_type nnz, size_type source_stride, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, - size_type result_stride, ValueType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num_rows) { - for (size_type col = 0; col < nnz; col++) { - result[tidx * result_stride + - col_idxs[tidx + col * source_stride]] += - values[tidx + col * source_stride]; - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_dense, fill_in_dense); - - template void count_nnz_per_row(size_type num_rows, size_type max_nnz_per_row, size_type stride, const ValueType* __restrict__ values, @@ -618,37 +579,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto result_stride = result->get_stride(); - const auto col_idxs = source->get_const_col_idxs(); - const auto vals = source->get_const_values(); - const auto source_stride = source->get_stride(); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - kernel::initialize_zero_dense(init_grid_dim, block_size, 0, - exec->get_queue(), num_rows, num_cols, - result_stride, result->get_values()); - - const auto grid_dim = ceildiv(num_rows, default_block_size); - kernel::fill_in_dense(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, source->get_num_stored_elements_per_row(), - source_stride, col_idxs, vals, result_stride, - result->get_values()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Ell* source, diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp index e2fb12850a7..d1d1707ea18 100644 --- a/dpcpp/matrix/fbcsr_kernels.dp.cpp +++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp @@ -96,12 +96,12 @@ void convert_row_ptrs_to_idxs(std::shared_ptr exec, template -void convert_to_dense(std::shared_ptr exec, - const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; +void fill_in_dense(std::shared_ptr exec, + const matrix::Fbcsr* source, + matrix::Dense* result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); template diff --git a/dpcpp/matrix/hybrid_kernels.dp.cpp b/dpcpp/matrix/hybrid_kernels.dp.cpp index 51a10c321ad..223eb0a6266 100644 --- a/dpcpp/matrix/hybrid_kernels.dp.cpp +++ b/dpcpp/matrix/hybrid_kernels.dp.cpp @@ -218,15 +218,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp index 5e596b5bf53..8a0961533b6 100644 --- a/dpcpp/matrix/sellp_kernels.dp.cpp +++ b/dpcpp/matrix/sellp_kernels.dp.cpp @@ -138,72 +138,6 @@ GKO_ENABLE_DEFAULT_HOST(advanced_spmv_kernel, advanced_spmv_kernel); namespace kernel { -template -void initialize_zero_dense(size_type num_rows, size_type num_cols, - size_type stride, ValueType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - const auto tidx_x = - item_ct1.get_local_id(2) + - item_ct1.get_local_range().get(2) * item_ct1.get_group(2); - const auto tidx_y = - item_ct1.get_local_id(1) + - item_ct1.get_local_range().get(1) * item_ct1.get_group(1); - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - -GKO_ENABLE_DEFAULT_HOST(initialize_zero_dense, initialize_zero_dense); - - -template -void fill_in_dense(size_type num_rows, size_type num_cols, size_type stride, - size_type slice_size, - const size_type* __restrict__ slice_lengths, - const size_type* __restrict__ slice_sets, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1) -{ - const auto global_row = - thread::get_subwarp_id_flat(item_ct1); - const auto row = global_row % slice_size; - const auto slice = global_row / slice_size; - const auto start_index = item_ct1.get_local_id(2) % threads_per_row; - - if (global_row < num_rows) { - for (auto i = start_index; i < slice_lengths[slice]; - i += threads_per_row) { - if (values[(slice_sets[slice] + i) * slice_size + row] != - zero()) { - result[global_row * stride + - col_idxs[(slice_sets[slice] + i) * slice_size + row]] = - values[(slice_sets[slice] + i) * slice_size + row]; - } - } - } -} - -template -void fill_in_dense(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, size_type num_cols, - size_type stride, size_type slice_size, - const size_type* slice_lengths, const size_type* slice_sets, - const IndexType* col_idxs, const ValueType* values, - ValueType* result) -{ - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - fill_in_dense( - num_rows, num_cols, stride, slice_size, slice_lengths, - slice_sets, col_idxs, values, result, item_ct1); - }); - }); -} - - template void count_nnz_per_row(size_type num_rows, size_type slice_size, const size_type* __restrict__ slice_sets, @@ -354,48 +288,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Dense* result) -{ - const auto num_rows = source->get_size()[0]; - const auto num_cols = source->get_size()[1]; - const auto vals = source->get_const_values(); - const auto col_idxs = source->get_const_col_idxs(); - const auto slice_lengths = source->get_const_slice_lengths(); - const auto slice_sets = source->get_const_slice_sets(); - const auto slice_size = source->get_slice_size(); - - const auto slice_num = ceildiv(num_rows, slice_size); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - - if (num_rows > 0 && result->get_stride() > 0) { - kernel::initialize_zero_dense( - init_grid_dim, block_size, 0, exec->get_queue(), num_rows, num_cols, - result->get_stride(), result->get_values()); - } - - constexpr auto threads_per_row = config::warp_size; - const auto grid_dim = - ceildiv(slice_size * slice_num * threads_per_row, default_block_size); - - if (grid_dim > 0) { - kernel::fill_in_dense( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - num_cols, result->get_stride(), slice_size, slice_lengths, - slice_sets, col_idxs, vals, result->get_values()); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Sellp* source, diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp index babb8dbc096..3e25f713397 100644 --- a/hip/matrix/coo_kernels.hip.cpp +++ b/hip/matrix/coo_kernels.hip.cpp @@ -190,38 +190,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto stride = result->get_stride(); - - const auto nnz = source->get_num_stored_elements(); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim), - dim3(block_size), 0, 0, num_rows, num_cols, stride, - as_hip_type(result->get_values())); - - const auto grid_dim = ceildiv(nnz, default_block_size); - hipLaunchKernelGGL(kernel::fill_in_dense, dim3(grid_dim), - dim3(default_block_size), 0, 0, nnz, - as_hip_type(source->get_const_row_idxs()), - as_hip_type(source->get_const_col_idxs()), - as_hip_type(source->get_const_values()), stride, - as_hip_type(result->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); - - } // namespace coo } // namespace hip } // namespace kernels diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index f3e3f133d90..cc60837eb3f 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -697,9 +697,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); template -void convert_to_dense(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Csr* source, + matrix::Dense* result) { const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -708,14 +708,6 @@ void convert_to_dense(std::shared_ptr exec, const auto col_idxs = source->get_const_col_idxs(); const auto vals = source->get_const_values(); - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim), - dim3(block_size), 0, 0, num_rows, num_cols, stride, - as_hip_type(result->get_values())); - auto grid_dim = ceildiv(num_rows, default_block_size); hipLaunchKernelGGL( kernel::fill_in_dense, dim3(grid_dim), dim3(default_block_size), 0, 0, @@ -724,7 +716,7 @@ void convert_to_dense(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); template diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2673447c024..2d5daeb33ac 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -290,38 +290,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto result_stride = result->get_stride(); - const auto col_idxs = source->get_const_col_idxs(); - const auto vals = source->get_const_values(); - const auto source_stride = source->get_stride(); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim), - dim3(block_size), 0, 0, num_rows, num_cols, - result_stride, as_hip_type(result->get_values())); - - const auto grid_dim = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL(kernel::fill_in_dense, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - source->get_num_stored_elements_per_row(), source_stride, - as_hip_type(col_idxs), as_hip_type(vals), result_stride, - as_hip_type(result->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Ell* source, diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 530f42e8978..a9700699199 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -99,12 +99,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_dense(std::shared_ptr exec, - const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; +void fill_in_dense(std::shared_ptr exec, + const matrix::Fbcsr* source, + matrix::Dense* result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); template diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp index f65084f657c..08b3f80c951 100644 --- a/hip/matrix/hybrid_kernels.hip.cpp +++ b/hip/matrix/hybrid_kernels.hip.cpp @@ -79,15 +79,6 @@ constexpr int warps_in_block = 4; #include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index c17c0c4d8d0..d1a5ae9747d 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -114,51 +114,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Dense* result) -{ - const auto num_rows = source->get_size()[0]; - const auto num_cols = source->get_size()[1]; - const auto vals = source->get_const_values(); - const auto col_idxs = source->get_const_col_idxs(); - const auto slice_lengths = source->get_const_slice_lengths(); - const auto slice_sets = source->get_const_slice_sets(); - const auto slice_size = source->get_slice_size(); - - const auto slice_num = ceildiv(num_rows, slice_size); - - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - - if (num_rows > 0 && result->get_stride() > 0) { - hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim), - dim3(block_size), 0, 0, num_rows, num_cols, - result->get_stride(), - as_hip_type(result->get_values())); - } - - constexpr auto threads_per_row = config::warp_size; - const auto grid_dim = - ceildiv(slice_size * slice_num * threads_per_row, default_block_size); - - if (grid_dim > 0) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::fill_in_dense), - dim3(grid_dim), dim3(default_block_size), 0, 0, num_rows, num_cols, - result->get_stride(), slice_size, as_hip_type(slice_lengths), - as_hip_type(slice_sets), as_hip_type(col_idxs), as_hip_type(vals), - as_hip_type(result->get_values())); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Sellp* source, diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 0d1bb042e70..7b82fcb4e2f 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -63,6 +63,9 @@ class Diagonal; template class Ell; +template +class Fbcsr; + template class Hybrid; @@ -123,6 +126,8 @@ class Dense friend class Diagonal; friend class Ell; friend class Ell; + friend class Fbcsr; + friend class Fbcsr; friend class Hybrid; friend class Hybrid; friend class Sellp; @@ -989,6 +994,14 @@ class Dense */ virtual void compute_norm1_impl(LinOp* result) const; + /** + * Resizes the matrix to the given size. + * + * If the new size matches the current size, the stride will be left + * unchanged, otherwise it will be set to the number of columns. + */ + void resize(gko::dim<2> new_size); + /** * @copydoc create_submatrix(const span, const span, const size_type) * diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp index 1a9083f3d80..2dbb5d7d9c7 100644 --- a/omp/matrix/coo_kernels.cpp +++ b/omp/matrix/coo_kernels.cpp @@ -355,32 +355,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Dense* result) -{ - auto coo_val = source->get_const_values(); - auto coo_col = source->get_const_col_idxs(); - auto coo_row = source->get_const_row_idxs(); - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; -#pragma omp parallel for - for (size_type row = 0; row < num_rows; row++) { - for (size_type col = 0; col < num_cols; col++) { - result->at(row, col) = zero(); - } - } -#pragma omp parallel for - for (size_type i = 0; i < source->get_num_stored_elements(); i++) { - result->at(coo_row[i], coo_col[i]) += coo_val[i]; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); - - } // namespace coo } // namespace omp } // namespace kernels diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 22087a20ead..0e2091cb8ee 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -554,9 +554,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); template -void convert_to_dense(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Csr* source, + matrix::Dense* result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -577,7 +577,7 @@ void convert_to_dense(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); template diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index 913ce97ec3f..5906fabab46 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -258,31 +258,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Dense* result) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto num_stored_elements_per_row = - source->get_num_stored_elements_per_row(); - -#pragma omp parallel for - for (size_type row = 0; row < num_rows; row++) { - for (size_type col = 0; col < num_cols; col++) { - result->at(row, col) = zero(); - } - for (size_type i = 0; i < num_stored_elements_per_row; i++) { - result->at(row, source->col_at(row, i)) += source->val_at(row, i); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Ell* source, diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index 0ce83d93a03..aa43f668c43 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -210,13 +210,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_dense(std::shared_ptr exec, - const matrix::Fbcsr* const source, - matrix::Dense* const result) - GKO_NOT_IMPLEMENTED; +void fill_in_dense(std::shared_ptr exec, + const matrix::Fbcsr* const source, + matrix::Dense* const result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); template diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp index 309fb23a9f5..1270fe63b7b 100644 --- a/omp/matrix/hybrid_kernels.cpp +++ b/omp/matrix/hybrid_kernels.cpp @@ -95,46 +95,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Dense* result) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto ell_val = source->get_const_ell_values(); - auto ell_col = source->get_const_ell_col_idxs(); - - auto ell_num_stored_elements_per_row = - source->get_ell_num_stored_elements_per_row(); - - for (size_type row = 0; row < num_rows; row++) { -#pragma omp parallel for - for (size_type col = 0; col < num_cols; col++) { - result->at(row, col) = zero(); - } -#pragma omp parallel for - for (size_type i = 0; i < ell_num_stored_elements_per_row; i++) { - result->at(row, source->ell_col_at(row, i)) += - source->ell_val_at(row, i); - } - } - - auto coo_val = source->get_const_coo_values(); - auto coo_col = source->get_const_coo_col_idxs(); - auto coo_row = source->get_const_coo_row_idxs(); -// The following parallelization is dangerous and can fail if the -// COO matrix contains several elements assigned to the same location. -#pragma omp parallel for - for (size_type i = 0; i < source->get_coo_num_stored_elements(); i++) { - result->at(coo_row[i], coo_col[i]) += coo_val[i]; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp index 4dc02af2f69..53fff2c522a 100644 --- a/omp/matrix/sellp_kernels.cpp +++ b/omp/matrix/sellp_kernels.cpp @@ -222,42 +222,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Dense* result) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto vals = source->get_const_values(); - auto col_idxs = source->get_const_col_idxs(); - auto slice_lengths = source->get_const_slice_lengths(); - auto slice_sets = source->get_const_slice_sets(); - auto slice_size = source->get_slice_size(); - auto slice_num = - ceildiv(source->get_size()[0] + slice_size - 1, slice_size); -#pragma omp parallel for collapse(2) - for (size_type slice = 0; slice < slice_num; slice++) { - for (size_type row = 0; row < slice_size; row++) { - size_type global_row = slice * slice_size + row; - if (global_row < num_rows) { - for (size_type col = 0; col < num_cols; col++) { - result->at(global_row, col) = zero(); - } - for (size_type i = slice_sets[slice]; i < slice_sets[slice + 1]; - i++) { - result->at(global_row, col_idxs[row + i * slice_size]) += - vals[row + i * slice_size]; - } - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Sellp* source, diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp index a67b7b2beec..d6c85a439f3 100644 --- a/reference/matrix/coo_kernels.cpp +++ b/reference/matrix/coo_kernels.cpp @@ -149,27 +149,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_dense(std::shared_ptr exec, - const matrix::Coo* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Coo* source, + matrix::Dense* result) { auto coo_val = source->get_const_values(); auto coo_col = source->get_const_col_idxs(); auto coo_row = source->get_const_row_idxs(); - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - for (size_type row = 0; row < num_rows; row++) { - for (size_type col = 0; col < num_cols; col++) { - result->at(row, col) = zero(); - } - } for (size_type i = 0; i < source->get_num_stored_elements(); i++) { result->at(coo_row[i], coo_col[i]) += coo_val[i]; } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_COO_FILL_IN_DENSE_KERNEL); template diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 2e7272afbbd..991c914f336 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -375,9 +375,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_dense(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Csr* source, + matrix::Dense* result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -386,9 +386,6 @@ void convert_to_dense(std::shared_ptr exec, auto vals = source->get_const_values(); for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - result->at(row, col) = zero(); - } for (size_type i = row_ptrs[row]; i < static_cast(row_ptrs[row + 1]); ++i) { result->at(row, col_idxs[i]) = vals[i]; @@ -397,7 +394,7 @@ void convert_to_dense(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); template diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 2f4816f6c36..c6f5668da91 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -193,9 +193,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_dense(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Ell* source, + matrix::Dense* result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -203,9 +203,6 @@ void convert_to_dense(std::shared_ptr exec, source->get_num_stored_elements_per_row(); for (size_type row = 0; row < num_rows; row++) { - for (size_type col = 0; col < num_cols; col++) { - result->at(row, col) = zero(); - } for (size_type i = 0; i < num_stored_elements_per_row; i++) { result->at(row, source->col_at(row, i)) += source->val_at(row, i); } @@ -213,7 +210,7 @@ void convert_to_dense(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); template diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index f181cfbaefc..e630c8c28a3 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -208,9 +208,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_dense(const std::shared_ptr, - const matrix::Fbcsr* const source, - matrix::Dense* const result) +void fill_in_dense(const std::shared_ptr, + const matrix::Fbcsr* const source, + matrix::Dense* const result) { const int bs = source->get_block_size(); const IndexType nbrows = source->get_num_block_rows(); @@ -226,14 +226,6 @@ void convert_to_dense(const std::shared_ptr, vals}; for (IndexType brow = 0; brow < nbrows; ++brow) { - for (size_type bcol = 0; bcol < nbcols; ++bcol) { - for (int ib = 0; ib < bs; ib++) { - for (int jb = 0; jb < bs; jb++) { - result->at(brow * bs + ib, bcol * bs + jb) = - zero(); - } - } - } for (IndexType ibnz = row_ptrs[brow]; ibnz < row_ptrs[brow + 1]; ++ibnz) { for (int ib = 0; ib < bs; ib++) { @@ -248,7 +240,7 @@ void convert_to_dense(const std::shared_ptr, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); template diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index 6af99e4e3eb..ab181f6203c 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -103,41 +103,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); -template -void convert_to_dense(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Dense* result) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto ell_val = source->get_const_ell_values(); - auto ell_col = source->get_const_ell_col_idxs(); - - auto ell_num_stored_elements_per_row = - source->get_ell_num_stored_elements_per_row(); - - for (size_type row = 0; row < num_rows; row++) { - for (size_type col = 0; col < num_cols; col++) { - result->at(row, col) = zero(); - } - for (size_type i = 0; i < ell_num_stored_elements_per_row; i++) { - result->at(row, source->ell_col_at(row, i)) += - source->ell_val_at(row, i); - } - } - - auto coo_val = source->get_const_coo_values(); - auto coo_col = source->get_const_coo_col_idxs(); - auto coo_row = source->get_const_coo_row_idxs(); - for (size_type i = 0; i < source->get_coo_num_stored_elements(); i++) { - result->at(coo_row[i], coo_col[i]) += coo_val[i]; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL); - - template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp index 1f92bdedf3c..443f39f56c2 100644 --- a/reference/matrix/sellp_kernels.cpp +++ b/reference/matrix/sellp_kernels.cpp @@ -191,9 +191,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_dense(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Dense* result) +void fill_in_dense(std::shared_ptr exec, + const matrix::Sellp* source, + matrix::Dense* result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; @@ -210,9 +210,6 @@ void convert_to_dense(std::shared_ptr exec, if (global_row >= num_rows) { break; } - for (size_type col = 0; col < num_cols; col++) { - result->at(global_row, col) = zero(); - } for (size_type i = slice_sets[slice]; i < slice_sets[slice + 1]; i++) { result->at(global_row, col_idxs[row + i * slice_size]) += @@ -223,7 +220,7 @@ void convert_to_dense(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL); + GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); template From e9803b91f81f533be0745046ed8ebbdf54ecce33 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 30 Nov 2021 19:13:40 +0100 Subject: [PATCH 11/32] unify convert_to(Csr) implementations --- common/unified/matrix/ell_kernels.cpp | 54 ++++++++++++ common/unified/matrix/hybrid_kernels.cpp | 61 +++++++++++++ common/unified/matrix/sellp_kernels.cpp | 34 ++++++++ core/device_hooks/common_kernels.inc.cpp | 5 +- core/matrix/coo.cpp | 32 ++++--- core/matrix/csr.cpp | 10 +++ core/matrix/ell.cpp | 56 +++++------- core/matrix/ell_kernels.hpp | 57 ++++++------- core/matrix/fbcsr.cpp | 9 +- core/matrix/hybrid.cpp | 41 ++++++--- core/matrix/hybrid_kernels.hpp | 11 +-- cuda/matrix/ell_kernels.cu | 76 ----------------- cuda/matrix/hybrid_kernels.cu | 101 ---------------------- cuda/test/matrix/ell_kernels.cpp | 36 ++------ cuda/test/matrix/hybrid_kernels.cpp | 13 --- dpcpp/matrix/ell_kernels.dp.cpp | 75 ---------------- dpcpp/matrix/hybrid_kernels.dp.cpp | 100 ---------------------- hip/matrix/ell_kernels.hip.cpp | 80 ----------------- hip/matrix/hybrid_kernels.hip.cpp | 104 ----------------------- hip/test/matrix/ell_kernels.hip.cpp | 36 ++------ hip/test/matrix/hybrid_kernels.hip.cpp | 13 --- include/ginkgo/core/matrix/csr.hpp | 6 ++ omp/matrix/ell_kernels.cpp | 42 --------- omp/matrix/hybrid_kernels.cpp | 85 ------------------ omp/test/matrix/ell_kernels.cpp | 14 --- omp/test/matrix/hybrid_kernels.cpp | 13 --- reference/matrix/ell_kernels.cpp | 35 ++------ reference/matrix/hybrid_kernels.cpp | 22 +---- reference/test/matrix/hybrid_kernels.cpp | 11 --- 29 files changed, 281 insertions(+), 951 deletions(-) diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp index d93e91ea18b..6945fcc4d49 100644 --- a/common/unified/matrix/ell_kernels.cpp +++ b/common/unified/matrix/ell_kernels.cpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/matrix/dense_kernels.hpp" namespace gko { @@ -126,6 +127,59 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Ell* source, + matrix::Csr* result) +{ + // ELL is stored in column-major, so we swap row and column parameters + run_kernel( + exec, + [] GKO_KERNEL(auto ell_col, auto row, auto ell_stride, auto in_cols, + auto in_vals, auto out_row_ptrs, auto out_cols, + auto out_vals) { + const auto ell_idx = ell_col * ell_stride + row; + const auto row_begin = out_row_ptrs[row]; + const auto row_size = out_row_ptrs[row + 1] - row_begin; + if (ell_col < row_size) { + out_cols[row_begin + ell_col] = in_cols[ell_idx]; + out_vals[row_begin + ell_col] = in_vals[ell_idx]; + } + }, + dim<2>{source->get_num_stored_elements_per_row(), + source->get_size()[0]}, + static_cast(source->get_stride()), source->get_const_col_idxs(), + source->get_const_values(), result->get_row_ptrs(), + result->get_col_idxs(), result->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); + + +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Ell* source, + IndexType* result) +{ + // ELL is stored in column-major, so we swap row and column parameters + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto ell_col, auto row, auto ell_stride, auto in_vals) { + const auto ell_idx = ell_col * ell_stride + row; + return is_nonzero(in_vals[ell_idx]) ? 1 : 0; + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, IndexType{}, result, + dim<2>{source->get_num_stored_elements_per_row(), + source->get_size()[0]}, + static_cast(source->get_stride()), source->get_const_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); + + } // namespace ell } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp index 9cf6f5fc778..7bc27b04b6b 100644 --- a/common/unified/matrix/hybrid_kernels.cpp +++ b/common/unified/matrix/hybrid_kernels.cpp @@ -59,6 +59,67 @@ void compute_row_nnz(std::shared_ptr exec, } +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Hybrid* source, + const IndexType* ell_row_ptrs, + const IndexType* coo_row_ptrs, + matrix::Csr* result) +{ + const auto ell = source->get_ell(); + const auto coo = source->get_coo(); + // ELL is stored in column-major, so we swap row and column parameters + run_kernel( + exec, + [] GKO_KERNEL(auto ell_col, auto row, auto ell_stride, auto in_cols, + auto in_vals, auto ell_row_ptrs, auto coo_row_ptrs, + auto out_cols, auto out_vals) { + const auto ell_idx = ell_col * ell_stride + row; + const auto out_row_begin = ell_row_ptrs[row] + coo_row_ptrs[row]; + const auto ell_row_size = ell_row_ptrs[row + 1] - ell_row_ptrs[row]; + if (ell_col < ell_row_size) { + const auto out_idx = out_row_begin + ell_col; + out_cols[out_idx] = in_cols[ell_idx]; + out_vals[out_idx] = in_vals[ell_idx]; + } + }, + dim<2>{ell->get_num_stored_elements_per_row(), ell->get_size()[0]}, + static_cast(ell->get_stride()), ell->get_const_col_idxs(), + ell->get_const_values(), ell_row_ptrs, coo_row_ptrs, + result->get_col_idxs(), result->get_values()); + run_kernel( + exec, + [] GKO_KERNEL(auto idx, auto ell_row_ptrs, auto coo_row_ptrs, + auto out_row_ptrs) { + out_row_ptrs[idx] = ell_row_ptrs[idx] + coo_row_ptrs[idx]; + }, + source->get_size()[0] + 1, ell_row_ptrs, coo_row_ptrs, + result->get_row_ptrs()); + run_kernel( + exec, + [] GKO_KERNEL(auto idx, auto in_rows, auto in_cols, auto in_vals, + auto ell_row_ptrs, auto coo_row_ptrs, auto out_cols, + auto out_vals) { + const auto row = in_rows[idx]; + const auto col = in_cols[idx]; + const auto val = in_vals[idx]; + const auto coo_row_begin = coo_row_ptrs[row]; + const auto coo_local_pos = idx - coo_row_begin; + // compute row_ptrs[row] + ell_row_size[row] + const auto out_row_begin = ell_row_ptrs[row + 1] + coo_row_begin; + const auto out_idx = out_row_begin + coo_local_pos; + out_cols[out_idx] = col; + out_vals[out_idx] = val; + }, + coo->get_num_stored_elements(), coo->get_const_row_idxs(), + coo->get_const_col_idxs(), coo->get_const_values(), ell_row_ptrs, + coo_row_ptrs, result->get_col_idxs(), result->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); + + } // namespace hybrid } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp index 9a8fd2f687f..4c6419119b3 100644 --- a/common/unified/matrix/sellp_kernels.cpp +++ b/common/unified/matrix/sellp_kernels.cpp @@ -144,6 +144,40 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); +template +void convert_to_csr(std::shared_ptr exec, + const matrix::Sellp* source, + matrix::Csr* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto slice_size, auto slice_sets, auto cols, + auto values, auto out_row_ptrs, auto out_cols, + auto out_vals) { + const auto row_begin = out_row_ptrs[row]; + const auto row_end = out_row_ptrs[row + 1]; + const auto slice = row / slice_size; + const auto local_row = row % slice_size; + const auto slice_begin = slice_sets[slice]; + const auto slice_end = slice_sets[slice + 1]; + const auto slice_length = slice_end - slice_begin; + auto in_idx = slice_begin * slice_size + local_row; + for (auto i = row_begin; i < row_end; i++) { + out_cols[i] = cols[in_idx]; + out_vals[i] = values[in_idx]; + in_idx += slice_size; + } + }, + source->get_size()[0], source->get_slice_size(), + source->get_const_slice_sets(), source->get_const_col_idxs(), + source->get_const_values(), result->get_row_ptrs(), + result->get_col_idxs(), result->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); + + } // namespace sellp } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 68d6e950983..358b5f67b1c 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -548,9 +548,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELL_COMPUTE_MAX_ROW_NNZ_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); @@ -585,7 +583,6 @@ namespace hybrid { GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); } // namespace hybrid diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index c6041109610..9c7a56bb83c 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -149,16 +149,15 @@ void Coo::convert_to( Csr* result) const { auto exec = this->get_executor(); - auto tmp = Csr::create( - exec, this->get_size(), this->get_num_stored_elements(), - result->get_strategy()); - tmp->values_ = this->values_; - tmp->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); + result->row_ptrs_.resize_and_reset(this->get_size()[0] + 1); + result->col_idxs_ = this->col_idxs_; + result->values_ = this->values_; exec->run(coo::make_convert_idxs_to_ptrs( this->get_const_row_idxs(), this->get_num_stored_elements(), - this->get_size()[0], tmp->get_row_ptrs())); - tmp->make_srow(); - tmp->move_to(result); + this->get_size()[0], + make_temporary_clone(exec, &result->row_ptrs_)->get_data())); + result->make_srow(); } @@ -167,15 +166,14 @@ void Coo::move_to(Csr* result) { auto exec = this->get_executor(); const auto nnz = this->get_num_stored_elements(); - auto tmp = Csr::create(exec, this->get_size(), nnz, - result->get_strategy()); - tmp->values_ = std::move(this->values_); - tmp->col_idxs_ = std::move(this->col_idxs_); - exec->run(coo::make_convert_idxs_to_ptrs(this->get_const_row_idxs(), nnz, - this->get_size()[0], - tmp->get_row_ptrs())); - tmp->make_srow(); - tmp->move_to(result); + result->set_size(this->get_size()); + result->row_ptrs_.resize_and_reset(this->get_size()[0] + 1); + result->col_idxs_ = std::move(this->col_idxs_); + result->values_ = std::move(this->values_); + exec->run(coo::make_convert_idxs_to_ptrs( + this->get_const_row_idxs(), nnz, this->get_size()[0], + make_temporary_clone(exec, &result->row_ptrs_)->get_data())); + result->make_srow(); } diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index dba59b3e58d..637a8b6a43d 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -368,6 +368,16 @@ void Csr::write(mat_data& data) const } +template +void Csr::resize(gko::dim<2> new_size, size_type nnz) +{ + this->set_size(new_size); + this->row_ptrs_.resize_and_reset(new_size[0] + 1); + this->col_idxs_.resize_and_reset(nnz); + this->values_.resize_and_reset(nnz); +} + + template std::unique_ptr Csr::transpose() const { diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index e5c1ef0ddb6..4282cbc0721 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/absolute_array_kernels.hpp" #include "core/components/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/ell_kernels.hpp" @@ -65,11 +66,10 @@ GKO_REGISTER_OPERATION(compute_max_row_nnz, ell::compute_max_row_nnz); GKO_REGISTER_OPERATION(fill_in_matrix_data, ell::fill_in_matrix_data); GKO_REGISTER_OPERATION(fill_in_dense, ell::fill_in_dense); GKO_REGISTER_OPERATION(convert_to_csr, ell::convert_to_csr); -GKO_REGISTER_OPERATION(count_nonzeros, ell::count_nonzeros); -GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, - ell::calculate_nonzeros_per_row); +GKO_REGISTER_OPERATION(count_nonzeros_per_row, ell::count_nonzeros_per_row); GKO_REGISTER_OPERATION(extract_diagonal, ell::extract_diagonal); GKO_REGISTER_OPERATION(fill_array, components::fill_array); +GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum); GKO_REGISTER_OPERATION(inplace_absolute_array, components::inplace_absolute_array); GKO_REGISTER_OPERATION(outplace_absolute_array, @@ -80,32 +80,6 @@ GKO_REGISTER_OPERATION(outplace_absolute_array, } // namespace ell -namespace { - - -template -size_type calculate_max_nnz_per_row( - const matrix_data& data) -{ - size_type nnz = 0; - IndexType current_row = 0; - size_type num_stored_elements_per_row = 0; - for (const auto& elem : data.nonzeros) { - if (elem.row != current_row) { - current_row = elem.row; - num_stored_elements_per_row = - std::max(num_stored_elements_per_row, nnz); - nnz = 0; - } - nnz += (elem.value != zero()); - } - return std::max(num_stored_elements_per_row, nnz); -} - - -} // namespace - - template void Ell::apply_impl(const LinOp* b, LinOp* x) const { @@ -176,16 +150,26 @@ void Ell::convert_to( Csr* result) const { auto exec = this->get_executor(); + const auto num_rows = this->get_size()[0]; - size_type num_stored_elements = 0; - exec->run(ell::make_count_nonzeros(this, &num_stored_elements)); + Array row_ptrs{exec, num_rows + 1}; - auto tmp = Csr::create( - exec, this->get_size(), num_stored_elements, result->get_strategy()); - exec->run(ell::make_convert_to_csr(this, tmp.get())); + exec->run(ell::make_count_nonzeros_per_row(this, row_ptrs.get_data())); + exec->run(ell::make_prefix_sum(row_ptrs.get_data(), num_rows + 1)); - tmp->make_srow(); - tmp->move_to(result); + const auto nnz = static_cast( + exec->copy_val_to_host(row_ptrs.get_const_data() + num_rows)); + + result->row_ptrs_ = row_ptrs; + result->resize(this->get_size(), nnz); + + { + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_ = row_ptrs; + exec->run(ell::make_convert_to_csr( + this, make_temporary_clone(exec, result).get())); + } + result->make_srow(); } diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp index e7c3eaa232c..494c3bf08a3 100644 --- a/core/matrix/ell_kernels.hpp +++ b/core/matrix/ell_kernels.hpp @@ -86,45 +86,36 @@ namespace kernels { const matrix::Ell* source, \ matrix::Csr* result) -#define GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL(ValueType, IndexType) \ - void count_nonzeros(std::shared_ptr exec, \ - const matrix::Ell* source, \ - size_type* result) - -#define GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, \ - IndexType) \ - void calculate_nonzeros_per_row( \ - std::shared_ptr exec, \ - const matrix::Ell* source, \ - Array* result) +#define GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType) \ + void count_nonzeros_per_row( \ + std::shared_ptr exec, \ + const matrix::Ell* source, IndexType* result) #define GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) \ void extract_diagonal(std::shared_ptr exec, \ const matrix::Ell* orig, \ matrix::Diagonal* diag) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \ - OutputValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \ - OutputValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_COMPUTE_MAX_ROW_NNZ_KERNEL(IndexType); \ - template \ - GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_COMPUTE_MAX_ROW_NNZ_KERNEL(IndexType); \ + template \ + GKO_DECLARE_ELL_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 867f13654d9..de3e0bfe32b 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -178,11 +178,10 @@ void Fbcsr::convert_to( Csr* const result) const { auto exec = this->get_executor(); - auto tmp = Csr::create( - exec, this->get_size(), this->get_num_stored_elements(), - result->get_strategy()); - exec->run(fbcsr::make_convert_to_csr(this, tmp.get())); - tmp->move_to(result); + result->resize(this->get_size(), this->get_num_stored_elements()); + exec->run(fbcsr::make_convert_to_csr( + this, make_temporary_clone(exec, result).get())); + result->make_srow(); } diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 890cdec2251..b7d7e0ce539 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -48,6 +48,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/absolute_array_kernels.hpp" #include "core/components/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_kernels.hpp" #include "core/matrix/ell_kernels.hpp" #include "core/matrix/hybrid_kernels.hpp" @@ -64,11 +66,13 @@ GKO_REGISTER_OPERATION(compute_row_nnz, hybrid::compute_row_nnz); GKO_REGISTER_OPERATION(split_matrix_data, hybrid::split_matrix_data); GKO_REGISTER_OPERATION(ell_fill_in_dense, ell::fill_in_dense); GKO_REGISTER_OPERATION(coo_fill_in_dense, coo::fill_in_dense); +GKO_REGISTER_OPERATION(ell_extract_diagonal, ell::extract_diagonal); +GKO_REGISTER_OPERATION(coo_extract_diagonal, coo::extract_diagonal); +GKO_REGISTER_OPERATION(ell_count_nonzeros_per_row, ell::count_nonzeros_per_row); +GKO_REGISTER_OPERATION(convert_idxs_to_ptrs, components::convert_idxs_to_ptrs); GKO_REGISTER_OPERATION(convert_to_csr, hybrid::convert_to_csr); -GKO_REGISTER_OPERATION(count_nonzeros, hybrid::count_nonzeros); -GKO_REGISTER_OPERATION(extract_coo_diagonal, coo::extract_diagonal); -GKO_REGISTER_OPERATION(extract_ell_diagonal, ell::extract_diagonal); GKO_REGISTER_OPERATION(fill_array, components::fill_array); +GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum); GKO_REGISTER_OPERATION(inplace_absolute_array, components::inplace_absolute_array); GKO_REGISTER_OPERATION(outplace_absolute_array, @@ -156,16 +160,29 @@ void Hybrid::convert_to( Csr* result) const { auto exec = this->get_executor(); + const auto num_rows = this->get_size()[0]; - size_type num_stored_elements = 0; - exec->run(hybrid::make_count_nonzeros(this, &num_stored_elements)); + Array ell_row_ptrs{exec, num_rows + 1}; + Array coo_row_ptrs{exec, num_rows + 1}; - auto tmp = Csr::create( - exec, this->get_size(), num_stored_elements, result->get_strategy()); - exec->run(hybrid::make_convert_to_csr(this, tmp.get())); + exec->run(hybrid::make_ell_count_nonzeros_per_row(this->get_ell(), + ell_row_ptrs.get_data())); + exec->run(hybrid::make_prefix_sum(ell_row_ptrs.get_data(), num_rows + 1)); + exec->run(hybrid::make_convert_idxs_to_ptrs( + this->get_const_coo_row_idxs(), this->get_coo_num_stored_elements(), + num_rows, coo_row_ptrs.get_data())); - tmp->make_srow(); - tmp->move_to(result); + const auto nnz = static_cast( + exec->copy_val_to_host(ell_row_ptrs.get_const_data() + num_rows) + + exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows)); + + result->resize(this->get_size(), nnz); + + exec->run(hybrid::make_convert_to_csr( + this, ell_row_ptrs.get_const_data(), coo_row_ptrs.get_const_data(), + make_temporary_clone(exec, result).get())); + + result->make_srow(); } @@ -252,8 +269,8 @@ Hybrid::extract_diagonal() const auto diag = Diagonal::create(exec, diag_size); exec->run(hybrid::make_fill_array(diag->get_values(), diag->get_size()[0], zero())); - exec->run(hybrid::make_extract_ell_diagonal(this->get_ell(), lend(diag))); - exec->run(hybrid::make_extract_coo_diagonal(this->get_coo(), lend(diag))); + exec->run(hybrid::make_ell_extract_diagonal(this->get_ell(), lend(diag))); + exec->run(hybrid::make_coo_extract_diagonal(this->get_coo(), lend(diag))); return diag; } diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp index fddab172cd8..07670678a1d 100644 --- a/core/matrix/hybrid_kernels.hpp +++ b/core/matrix/hybrid_kernels.hpp @@ -62,21 +62,16 @@ namespace kernels { #define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ void convert_to_csr(std::shared_ptr exec, \ const matrix::Hybrid* source, \ + const IndexType* ell_row_ptrs, \ + const IndexType* coo_row_ptrs, \ matrix::Csr* result) -#define GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL(ValueType, IndexType) \ - void count_nonzeros(std::shared_ptr exec, \ - const matrix::Hybrid* source, \ - size_type* result) - #define GKO_DECLARE_ALL_AS_TEMPLATES \ GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ; \ template \ GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL(ValueType, IndexType) + GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(hybrid, GKO_DECLARE_ALL_AS_TEMPLATES); diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index d935a56c905..4c7db28eb37 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -287,82 +287,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); - - const auto stride = source->get_stride(); - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - - constexpr auto rows_per_block = - ceildiv(default_block_size, config::warp_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - - kernel::count_nnz_per_row<<>>( - num_rows, max_nnz_per_row, stride, - as_cuda_type(source->get_const_values()), as_cuda_type(row_ptrs)); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - size_type grid_dim = ceildiv(num_rows, default_block_size); - - kernel::fill_in_csr<<>>( - num_rows, max_nnz_per_row, stride, - as_cuda_type(source->get_const_values()), - as_cuda_type(source->get_const_col_idxs()), as_cuda_type(row_ptrs), - as_cuda_type(col_idxs), as_cuda_type(values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Ell* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Ell* source, - Array* result) -{ - const auto num_rows = source->get_size()[0]; - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - const auto stride = source->get_stride(); - const auto values = source->get_const_values(); - - const auto warp_size = config::warp_size; - const auto grid_dim = ceildiv(num_rows * warp_size, default_block_size); - - kernel::count_nnz_per_row<<>>( - num_rows, max_nnz_per_row, stride, as_cuda_type(values), - as_cuda_type(result->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void extract_diagonal(std::shared_ptr exec, const matrix::Ell* orig, diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu index 9a15798bf6a..9db0cddce2a 100644 --- a/cuda/matrix/hybrid_kernels.cu +++ b/cuda/matrix/hybrid_kernels.cu @@ -78,107 +78,6 @@ constexpr int warps_in_block = 4; #include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Csr* result) -{ - const auto num_rows = source->get_size()[0]; - auto coo_offset = Array(exec, num_rows + 1); - auto coo_val = source->get_const_coo_values(); - auto coo_col = source->get_const_coo_col_idxs(); - auto coo_row = source->get_const_coo_row_idxs(); - auto ell_val = source->get_const_ell_values(); - auto ell_col = source->get_const_ell_col_idxs(); - const auto stride = source->get_ell_stride(); - const auto max_nnz_per_row = source->get_ell_num_stored_elements_per_row(); - const auto coo_num_stored_elements = source->get_coo_num_stored_elements(); - - // Compute the row offset of Coo without zeros - components::convert_idxs_to_ptrs(exec, coo_row, coo_num_stored_elements, - num_rows, coo_offset.get_data()); - - // Compute the row ptrs of Csr - auto row_ptrs = result->get_row_ptrs(); - auto coo_row_ptrs = Array(exec, num_rows); - - components::fill_array(exec, row_ptrs, num_rows + 1, zero()); - size_type grid_num = ceildiv(num_rows, warps_in_block); - ell::kernel::count_nnz_per_row<<>>( - num_rows, max_nnz_per_row, stride, as_cuda_type(ell_val), - as_cuda_type(row_ptrs)); - - components::fill_array(exec, coo_row_ptrs.get_data(), num_rows, - zero()); - - auto nwarps = - coo::host_kernel::calculate_nwarps(exec, coo_num_stored_elements); - if (nwarps > 0) { - int num_lines = - ceildiv(coo_num_stored_elements, nwarps * config::warp_size); - const dim3 coo_block(config::warp_size, warps_in_block, 1); - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); - - kernel::count_coo_row_nnz<<>>( - coo_num_stored_elements, num_lines, as_cuda_type(coo_val), - as_cuda_type(coo_row), as_cuda_type(coo_row_ptrs.get_data())); - } - - kernel::add<<>>( - num_rows, as_cuda_type(row_ptrs), - as_cuda_type(coo_row_ptrs.get_const_data())); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - // Fill the value - grid_num = ceildiv(num_rows, default_block_size); - kernel::fill_in_csr<<>>( - num_rows, max_nnz_per_row, stride, as_cuda_type(ell_val), - as_cuda_type(ell_col), as_cuda_type(coo_val), as_cuda_type(coo_col), - as_cuda_type(coo_offset.get_const_data()), as_cuda_type(row_ptrs), - as_cuda_type(result->get_col_idxs()), - as_cuda_type(result->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Hybrid* source, - size_type* result) -{ - size_type ell_nnz = 0; - size_type coo_nnz = 0; - ell::count_nonzeros(exec, source->get_ell(), &ell_nnz); - - auto nnz = source->get_coo_num_stored_elements(); - auto nwarps = coo::host_kernel::calculate_nwarps(exec, nnz); - if (nwarps > 0) { - int num_lines = ceildiv(nnz, nwarps * config::warp_size); - const dim3 coo_block(config::warp_size, warps_in_block, 1); - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - components::fill_array(exec, nnz_per_row.get_data(), num_rows, - zero()); - kernel::count_coo_row_nnz<<>>( - nnz, num_lines, as_cuda_type(source->get_coo()->get_const_values()), - as_cuda_type(source->get_coo()->get_const_row_idxs()), - as_cuda_type(nnz_per_row.get_data())); - - coo_nnz = - reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); - } - - *result = ell_nnz + coo_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); - - } // namespace hybrid } // namespace cuda } // namespace kernels diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 8ba8b39d91c..0eb9feaa8ea 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -558,37 +558,15 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); - gko::Array nnz_per_row; - nnz_per_row.set_executor(ref); - nnz_per_row.resize_and_reset(mtx->get_size()[0]); - - gko::Array dnnz_per_row; - dnnz_per_row.set_executor(cuda); - dnnz_per_row.resize_and_reset(dmtx->get_size()[0]); - - gko::kernels::reference::ell::calculate_nonzeros_per_row(ref, mtx.get(), - &nnz_per_row); - gko::kernels::cuda::ell::calculate_nonzeros_per_row(cuda, dmtx.get(), - &dnnz_per_row); - - auto tmp = gko::Array(ref, dnnz_per_row); - for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { - ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); - } -} - - -TEST_F(Ell, CountNNZIsEquivalentToRef) -{ - set_up_apply_data(); - - gko::size_type nnz; - gko::size_type dnnz; + gko::Array nnz_per_row{ref, mtx->get_size()[0]}; + gko::Array dnnz_per_row{cuda, dmtx->get_size()[0]}; - gko::kernels::reference::ell::count_nonzeros(ref, mtx.get(), &nnz); - gko::kernels::cuda::ell::count_nonzeros(cuda, dmtx.get(), &dnnz); + gko::kernels::reference::ell::count_nonzeros_per_row( + ref, mtx.get(), nnz_per_row.get_data()); + gko::kernels::cuda::ell::count_nonzeros_per_row(cuda, dmtx.get(), + dnnz_per_row.get_data()); - ASSERT_EQ(nnz, dnnz); + GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row); } diff --git a/cuda/test/matrix/hybrid_kernels.cpp b/cuda/test/matrix/hybrid_kernels.cpp index e4ce76e6eb0..74b2e9b2eb8 100644 --- a/cuda/test/matrix/hybrid_kernels.cpp +++ b/cuda/test/matrix/hybrid_kernels.cpp @@ -210,19 +210,6 @@ TEST_F(Hybrid, AdvancedApplyToComplexIsEquivalentToRef) } -TEST_F(Hybrid, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type nonzeros; - gko::size_type dnonzeros; - - gko::kernels::reference::hybrid::count_nonzeros(ref, mtx.get(), &nonzeros); - gko::kernels::cuda::hybrid::count_nonzeros(cuda, dmtx.get(), &dnonzeros); - - ASSERT_EQ(nonzeros, dnonzeros); -} - - TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) { set_up_apply_data(1, std::make_shared(2)); diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index b22873e6c74..bed34ccb051 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -579,81 +579,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); - - const auto stride = source->get_stride(); - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - - constexpr auto rows_per_block = - ceildiv(default_block_size, config::warp_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - - kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, - exec->get_queue(), num_rows, max_nnz_per_row, - stride, source->get_const_values(), row_ptrs); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - size_type grid_dim = ceildiv(num_rows, default_block_size); - - kernel::fill_in_csr( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - max_nnz_per_row, stride, source->get_const_values(), - source->get_const_col_idxs(), row_ptrs, col_idxs, values); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Ell* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Ell* source, - Array* result) -{ - const auto num_rows = source->get_size()[0]; - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - const auto stride = source->get_stride(); - const auto values = source->get_const_values(); - - const auto warp_size = config::warp_size; - const auto grid_dim = ceildiv(num_rows * warp_size, default_block_size); - - kernel::count_nnz_per_row(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, max_nnz_per_row, - stride, values, result->get_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void extract_diagonal(std::shared_ptr exec, const matrix::Ell* orig, diff --git a/dpcpp/matrix/hybrid_kernels.dp.cpp b/dpcpp/matrix/hybrid_kernels.dp.cpp index 223eb0a6266..99509cfe2fc 100644 --- a/dpcpp/matrix/hybrid_kernels.dp.cpp +++ b/dpcpp/matrix/hybrid_kernels.dp.cpp @@ -218,106 +218,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Csr* result) -{ - const auto num_rows = source->get_size()[0]; - auto coo_offset = Array(exec, num_rows + 1); - auto coo_val = source->get_const_coo_values(); - auto coo_col = source->get_const_coo_col_idxs(); - auto coo_row = source->get_const_coo_row_idxs(); - auto ell_val = source->get_const_ell_values(); - auto ell_col = source->get_const_ell_col_idxs(); - const auto stride = source->get_ell_stride(); - const auto max_nnz_per_row = source->get_ell_num_stored_elements_per_row(); - const auto coo_num_stored_elements = source->get_coo_num_stored_elements(); - - // Compute the row offset of Coo without zeros - size_type grid_num = ceildiv(coo_num_stored_elements, default_block_size); - coo::kernel::convert_row_idxs_to_ptrs( - grid_num, default_block_size, 0, exec->get_queue(), coo_row, - coo_num_stored_elements, coo_offset.get_data(), num_rows + 1); - - // Compute the row ptrs of Csr - auto row_ptrs = result->get_row_ptrs(); - auto coo_row_ptrs = Array(exec, num_rows); - - components::fill_array(exec, row_ptrs, num_rows + 1, zero()); - grid_num = ceildiv(num_rows, warps_in_block); - ell::kernel::count_nnz_per_row(grid_num, default_block_size, 0, - exec->get_queue(), num_rows, max_nnz_per_row, - stride, ell_val, row_ptrs); - - components::fill_array(exec, coo_row_ptrs.get_data(), num_rows, - zero()); - - auto nwarps = - coo::host_kernel::calculate_nwarps(exec, coo_num_stored_elements); - if (nwarps > 0) { - int num_lines = - ceildiv(coo_num_stored_elements, nwarps * config::warp_size); - const dim3 coo_block(config::warp_size, warps_in_block, 1); - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); - - kernel::count_coo_row_nnz(coo_grid, coo_block, 0, exec->get_queue(), - coo_num_stored_elements, num_lines, coo_val, - coo_row, coo_row_ptrs.get_data()); - } - - kernel::add(grid_num, default_block_size, 0, exec->get_queue(), num_rows, - row_ptrs, coo_row_ptrs.get_const_data()); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - // Fill the value - grid_num = ceildiv(num_rows, default_block_size); - kernel::fill_in_csr(grid_num, default_block_size, 0, exec->get_queue(), - num_rows, max_nnz_per_row, stride, ell_val, ell_col, - coo_val, coo_col, coo_offset.get_const_data(), row_ptrs, - result->get_col_idxs(), result->get_values()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Hybrid* source, - size_type* result) -{ - size_type ell_nnz = 0; - size_type coo_nnz = 0; - ell::count_nonzeros(exec, source->get_ell(), &ell_nnz); - - auto nnz = source->get_coo_num_stored_elements(); - auto nwarps = coo::host_kernel::calculate_nwarps(exec, nnz); - if (nwarps > 0) { - int num_lines = ceildiv(nnz, nwarps * config::warp_size); - const dim3 coo_block(config::warp_size, warps_in_block, 1); - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - components::fill_array(exec, nnz_per_row.get_data(), num_rows, - zero()); - kernel::count_coo_row_nnz( - coo_grid, coo_block, 0, exec->get_queue(), nnz, num_lines, - source->get_coo()->get_const_values(), - source->get_coo()->get_const_row_idxs(), nnz_per_row.get_data()); - - coo_nnz = - reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); - } - - *result = ell_nnz + coo_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); - - } // namespace hybrid } // namespace dpcpp } // namespace kernels diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2d5daeb33ac..5c90e03adfb 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -290,86 +290,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Csr* result) -{ - auto num_rows = result->get_size()[0]; - - auto row_ptrs = result->get_row_ptrs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); - - const auto stride = source->get_stride(); - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - - constexpr auto rows_per_block = - ceildiv(default_block_size, config::warp_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - - hipLaunchKernelGGL( - kernel::count_nnz_per_row, dim3(grid_dim_nnz), dim3(default_block_size), - 0, 0, num_rows, max_nnz_per_row, stride, - as_hip_type(source->get_const_values()), as_hip_type(row_ptrs)); - - size_type grid_dim = ceildiv(num_rows + 1, default_block_size); - auto add_values = Array(exec, grid_dim); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - hipLaunchKernelGGL( - kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, - num_rows, max_nnz_per_row, stride, - as_hip_type(source->get_const_values()), - as_hip_type(source->get_const_col_idxs()), as_hip_type(row_ptrs), - as_hip_type(col_idxs), as_hip_type(values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Ell* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Ell* source, - Array* result) -{ - const auto num_rows = source->get_size()[0]; - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - const auto stride = source->get_stride(); - const auto values = source->get_const_values(); - - const auto warp_size = config::warp_size; - const auto grid_dim = ceildiv(num_rows * warp_size, default_block_size); - - hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - max_nnz_per_row, stride, as_hip_type(values), - as_hip_type(result->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void extract_diagonal(std::shared_ptr exec, const matrix::Ell* orig, diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp index 08b3f80c951..7883b4ce190 100644 --- a/hip/matrix/hybrid_kernels.hip.cpp +++ b/hip/matrix/hybrid_kernels.hip.cpp @@ -79,110 +79,6 @@ constexpr int warps_in_block = 4; #include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Csr* result) -{ - const auto num_rows = source->get_size()[0]; - auto coo_offset = Array(exec, num_rows + 1); - auto coo_val = source->get_const_coo_values(); - auto coo_col = source->get_const_coo_col_idxs(); - auto coo_row = source->get_const_coo_row_idxs(); - auto ell_val = source->get_const_ell_values(); - auto ell_col = source->get_const_ell_col_idxs(); - const auto stride = source->get_ell_stride(); - const auto max_nnz_per_row = source->get_ell_num_stored_elements_per_row(); - const auto coo_num_stored_elements = source->get_coo_num_stored_elements(); - - // Compute the row offset of Coo without zeros - components::convert_idxs_to_ptrs(exec, coo_row, coo_num_stored_elements, - num_rows, coo_offset.get_data()); - - // Compute the row ptrs of Csr - auto row_ptrs = result->get_row_ptrs(); - auto coo_row_ptrs = Array(exec, num_rows); - - components::fill_array(exec, row_ptrs, num_rows + 1, zero()); - size_type grid_num = ceildiv(num_rows, warps_in_block); - hipLaunchKernelGGL(ell::kernel::count_nnz_per_row, dim3(grid_num), - dim3(default_block_size), 0, 0, num_rows, - max_nnz_per_row, stride, as_hip_type(ell_val), - as_hip_type(row_ptrs)); - - components::fill_array(exec, coo_row_ptrs.get_data(), num_rows, - zero()); - - auto nwarps = - coo::host_kernel::calculate_nwarps(exec, coo_num_stored_elements); - if (nwarps > 0) { - int num_lines = - ceildiv(coo_num_stored_elements, nwarps * config::warp_size); - const dim3 coo_block(config::warp_size, warps_in_block, 1); - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); - - hipLaunchKernelGGL( - kernel::count_coo_row_nnz, dim3(coo_grid), dim3(coo_block), 0, 0, - coo_num_stored_elements, num_lines, as_hip_type(coo_val), - as_hip_type(coo_row), as_hip_type(coo_row_ptrs.get_data())); - } - - hipLaunchKernelGGL(kernel::add, dim3(grid_num), dim3(default_block_size), 0, - 0, num_rows, as_hip_type(row_ptrs), - as_hip_type(coo_row_ptrs.get_const_data())); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - // Fill the value - grid_num = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL( - kernel::fill_in_csr, dim3(grid_num), dim3(default_block_size), 0, 0, - num_rows, max_nnz_per_row, stride, as_hip_type(ell_val), - as_hip_type(ell_col), as_hip_type(coo_val), as_hip_type(coo_col), - as_hip_type(coo_offset.get_const_data()), as_hip_type(row_ptrs), - as_hip_type(result->get_col_idxs()), as_hip_type(result->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Hybrid* source, - size_type* result) -{ - size_type ell_nnz = 0; - size_type coo_nnz = 0; - ell::count_nonzeros(exec, source->get_ell(), &ell_nnz); - - auto nnz = source->get_coo_num_stored_elements(); - auto nwarps = coo::host_kernel::calculate_nwarps(exec, nnz); - if (nwarps > 0) { - int num_lines = ceildiv(nnz, nwarps * config::warp_size); - const dim3 coo_block(config::warp_size, warps_in_block, 1); - const dim3 coo_grid(ceildiv(nwarps, warps_in_block), 1); - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - components::fill_array(exec, nnz_per_row.get_data(), num_rows, - zero()); - hipLaunchKernelGGL(kernel::count_coo_row_nnz, dim3(coo_grid), - dim3(coo_block), 0, 0, nnz, num_lines, - as_hip_type(source->get_coo()->get_const_values()), - as_hip_type(source->get_coo()->get_const_row_idxs()), - as_hip_type(nnz_per_row.get_data())); - - coo_nnz = - reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); - } - - *result = ell_nnz + coo_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); - - } // namespace hybrid } // namespace hip } // namespace kernels diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp index d18efe1d2c5..6a1debcee71 100644 --- a/hip/test/matrix/ell_kernels.hip.cpp +++ b/hip/test/matrix/ell_kernels.hip.cpp @@ -559,37 +559,15 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); - gko::Array nnz_per_row; - nnz_per_row.set_executor(ref); - nnz_per_row.resize_and_reset(mtx->get_size()[0]); - - gko::Array dnnz_per_row; - dnnz_per_row.set_executor(hip); - dnnz_per_row.resize_and_reset(dmtx->get_size()[0]); - - gko::kernels::reference::ell::calculate_nonzeros_per_row(ref, mtx.get(), - &nnz_per_row); - gko::kernels::hip::ell::calculate_nonzeros_per_row(hip, dmtx.get(), - &dnnz_per_row); - - auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { - ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); - } -} - - -TEST_F(Ell, CountNNZIsEquivalentToRef) -{ - set_up_apply_data(); - - gko::size_type nnz; - gko::size_type dnnz; + gko::Array nnz_per_row{ref, mtx->get_size()[0]}; + gko::Array dnnz_per_row{hip, dmtx->get_size()[0]}; - gko::kernels::reference::ell::count_nonzeros(ref, mtx.get(), &nnz); - gko::kernels::hip::ell::count_nonzeros(hip, dmtx.get(), &dnnz); + gko::kernels::reference::ell::count_nonzeros_per_row( + ref, mtx.get(), nnz_per_row.get_data()); + gko::kernels::hip::ell::count_nonzeros_per_row(hip, dmtx.get(), + dnnz_per_row.get_data()); - ASSERT_EQ(nnz, dnnz); + GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row); } diff --git a/hip/test/matrix/hybrid_kernels.hip.cpp b/hip/test/matrix/hybrid_kernels.hip.cpp index 19d82664bc4..e8a64514f06 100644 --- a/hip/test/matrix/hybrid_kernels.hip.cpp +++ b/hip/test/matrix/hybrid_kernels.hip.cpp @@ -210,19 +210,6 @@ TEST_F(Hybrid, AdvancedApplyToComplexIsEquivalentToRef) } -TEST_F(Hybrid, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type nonzeros; - gko::size_type dnonzeros; - - gko::kernels::reference::hybrid::count_nonzeros(ref, mtx.get(), &nonzeros); - gko::kernels::hip::hybrid::count_nonzeros(hip, dmtx.get(), &dnonzeros); - - ASSERT_EQ(nonzeros, dnonzeros); -} - - TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) { set_up_apply_data(1, std::make_shared(2)); diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index c325bfbab65..160e6d9e0e8 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -64,6 +64,9 @@ class SparsityCsr; template class Csr; +template +class Fbcsr; + template class CsrBuilder; @@ -141,6 +144,7 @@ class Csr : public EnableLinOp>, friend class Hybrid; friend class Sellp; friend class SparsityCsr; + friend class Fbcsr; friend class CsrBuilder; friend class Csr, IndexType>; @@ -1029,6 +1033,8 @@ class Csr : public EnableLinOp>, this->make_srow(); } + void resize(gko::dim<2> new_size, size_type nnz); + void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index 5906fabab46..528c502fa10 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -258,48 +258,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Ell* source, - matrix::Csr* result) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL); - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Ell* source, - size_type* result) -{ - size_type nonzeros = 0; - const auto num_rows = source->get_size()[0]; - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - const auto stride = source->get_stride(); - -#pragma omp parallel for reduction(+ : nonzeros) - for (size_type row = 0; row < num_rows; row++) { - for (size_type i = 0; i < max_nnz_per_row; i++) { - nonzeros += (source->val_at(row, i) != zero()); - } - } - - *result = nonzeros; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Ell* source, - Array* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void extract_diagonal(std::shared_ptr exec, const matrix::Ell* orig, diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp index 1270fe63b7b..083975cdc96 100644 --- a/omp/matrix/hybrid_kernels.cpp +++ b/omp/matrix/hybrid_kernels.cpp @@ -95,91 +95,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Hybrid* source, - matrix::Csr* result) -{ - auto csr_val = result->get_values(); - auto csr_col_idxs = result->get_col_idxs(); - auto csr_row_ptrs = result->get_row_ptrs(); - const auto ell = source->get_ell(); - const auto max_nnz_per_row = ell->get_num_stored_elements_per_row(); - const auto coo_val = source->get_const_coo_values(); - const auto coo_col = source->get_const_coo_col_idxs(); - const auto coo_row = source->get_const_coo_row_idxs(); - const auto coo_nnz = source->get_coo_num_stored_elements(); - const auto num_rows = source->get_size()[0]; - auto coo_row_ptrs_array = Array(exec, num_rows + 1); - auto coo_row_ptrs = coo_row_ptrs_array.get_data(); - components::convert_idxs_to_ptrs(exec, coo_row, coo_nnz, num_rows, - coo_row_ptrs); - - // Compute the row sizes of Coo without zeros -#pragma omp parallel for - for (size_type row = 0; row < num_rows; row++) { - IndexType nonzeros{}; - for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) { - nonzeros += coo_val[j] != zero(); - } - for (size_type col = 0; col < max_nnz_per_row; col++) { - nonzeros += (ell->val_at(row, col) != zero()); - } - csr_row_ptrs[row] = nonzeros; - } - - components::prefix_sum(exec, csr_row_ptrs, num_rows + 1); - - // Fill in Csr -#pragma omp parallel for - for (IndexType row = 0; row < num_rows; row++) { - // Ell part - auto csr_idx = csr_row_ptrs[row]; - for (IndexType col = 0; col < max_nnz_per_row; col++) { - const auto val = ell->val_at(row, col); - if (val != zero()) { - csr_val[csr_idx] = val; - csr_col_idxs[csr_idx] = ell->col_at(row, col); - csr_idx++; - } - } - // Coo part - for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) { - if (coo_val[j] != zero()) { - csr_val[csr_idx] = coo_val[j]; - csr_col_idxs[csr_idx] = coo_col[j]; - csr_idx++; - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Hybrid* source, - size_type* result) -{ - size_type ell_nnz = 0; - size_type coo_nnz = 0; - gko::kernels::omp::ell::count_nonzeros(exec, source->get_ell(), &ell_nnz); - const auto coo_val = source->get_const_coo_values(); - const auto coo_max_nnz = source->get_coo_num_stored_elements(); - -#pragma omp parallel for reduction(+ : coo_nnz) - for (size_type ind = 0; ind < coo_max_nnz; ind++) { - coo_nnz += (coo_val[ind] != zero()); - } - *result = ell_nnz + coo_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); - - } // namespace hybrid } // namespace omp } // namespace kernels diff --git a/omp/test/matrix/ell_kernels.cpp b/omp/test/matrix/ell_kernels.cpp index c4d7951fb3a..c1ef88a68cd 100644 --- a/omp/test/matrix/ell_kernels.cpp +++ b/omp/test/matrix/ell_kernels.cpp @@ -518,20 +518,6 @@ TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef) } -TEST_F(Ell, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_data(); - - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::ell::count_nonzeros(ref, mtx.get(), &nnz); - gko::kernels::omp::ell::count_nonzeros(omp, dmtx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); -} - - TEST_F(Ell, ExtractDiagonalIsEquivalentToRef) { set_up_apply_data(); diff --git a/omp/test/matrix/hybrid_kernels.cpp b/omp/test/matrix/hybrid_kernels.cpp index ed6f70c2df0..9ba71b36738 100644 --- a/omp/test/matrix/hybrid_kernels.cpp +++ b/omp/test/matrix/hybrid_kernels.cpp @@ -217,19 +217,6 @@ TEST_F(Hybrid, AdvancedApplyToComplexIsEquivalentToRef) } -TEST_F(Hybrid, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type nonzeros; - gko::size_type dnonzeros; - - gko::kernels::reference::hybrid::count_nonzeros(ref, mtx.get(), &nonzeros); - gko::kernels::omp::hybrid::count_nonzeros(omp, dmtx.get(), &dnonzeros); - - ASSERT_EQ(nonzeros, dnonzeros); -} - - TEST_F(Hybrid, ConvertEmptyCooToCsrIsEquivalentToRef) { auto balanced_mtx = diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index c6f5668da91..0c9c014a812 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -246,51 +246,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void count_nonzeros(std::shared_ptr exec, - const matrix::Ell* source, - size_type* result) -{ - size_type nonzeros = 0; - const auto num_rows = source->get_size()[0]; - const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); - const auto stride = source->get_stride(); - - for (size_type row = 0; row < num_rows; row++) { - for (size_type i = 0; i < max_nnz_per_row; i++) { - nonzeros += (source->val_at(row, i) != zero()); - } - } - - *result = nonzeros; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Ell* source, - Array* result) +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Ell* source, + IndexType* result) { const auto num_rows = source->get_size()[0]; const auto max_nnz_per_row = source->get_num_stored_elements_per_row(); const auto stride = source->get_stride(); - auto row_nnz_val = result->get_data(); - for (size_type row = 0; row < num_rows; row++) { size_type nonzeros_in_this_row = 0; for (size_type i = 0; i < max_nnz_per_row; i++) { nonzeros_in_this_row += (source->val_at(row, i) != zero()); } - row_nnz_val[row] = nonzeros_in_this_row; + result[row] = nonzeros_in_this_row; } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_CALCULATE_NONZEROS_PER_ROW_KERNEL); + GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); template diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index ab181f6203c..d901235a8e1 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -106,6 +106,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, + const IndexType*, const IndexType*, matrix::Csr* result) { auto csr_val = result->get_values(); @@ -145,27 +146,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Hybrid* source, - size_type* result) -{ - size_type ell_nnz = 0; - size_type coo_nnz = 0; - gko::kernels::reference::ell::count_nonzeros(exec, source->get_ell(), - &ell_nnz); - const auto coo_val = source->get_const_coo_values(); - const auto coo_max_nnz = source->get_coo_num_stored_elements(); - for (size_type ind = 0; ind < coo_max_nnz; ind++) { - coo_nnz += (coo_val[ind] != zero()); - } - *result = ell_nnz + coo_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL); - - } // namespace hybrid } // namespace reference } // namespace kernels diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 7a26de2a477..bfcc6c57fc1 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -490,17 +490,6 @@ TYPED_TEST(Hybrid, MovesEmptyToCsr) } -TYPED_TEST(Hybrid, CountsNonzeros) -{ - gko::size_type nonzeros; - - gko::kernels::reference::hybrid::count_nonzeros( - this->exec, this->mtx1.get(), &nonzeros); - - ASSERT_EQ(nonzeros, 4); -} - - TYPED_TEST(Hybrid, AppliesWithStrideToDenseVector) { using Vec = typename TestFixture::Vec; From ed864aca6f6846a6a7c49e0be5147b9405e26cb5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 30 Nov 2021 20:26:16 +0100 Subject: [PATCH 12/32] fix benchmark LinOp warnings --- benchmark/utils/cuda_linops.cu | 75 +++++++++++++++++++++++++++--- benchmark/utils/hip_linops.hip.cpp | 36 +++++++++++--- 2 files changed, 99 insertions(+), 12 deletions(-) diff --git a/benchmark/utils/cuda_linops.cu b/benchmark/utils/cuda_linops.cu index 1c94113aed1..05cf7801429 100644 --- a/benchmark/utils/cuda_linops.cu +++ b/benchmark/utils/cuda_linops.cu @@ -76,12 +76,6 @@ public: } protected: - void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*, - gko::LinOp*) const override - { - GKO_NOT_IMPLEMENTED; - } - CusparseBase(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::LinOp(exec, size) @@ -143,6 +137,12 @@ class CusparseCsrmp public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -174,6 +174,10 @@ protected: &scalars.get_const_data()[1], dx); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + CusparseCsrmp(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), @@ -203,6 +207,12 @@ class CusparseCsr public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -234,6 +244,10 @@ protected: &scalars.get_const_data()[1], dx); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + CusparseCsr(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), @@ -264,6 +278,12 @@ class CusparseCsrmm public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -296,6 +316,10 @@ protected: dense_x->get_size()[0]); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + CusparseCsrmm(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), @@ -329,6 +353,12 @@ class CusparseCsrEx public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -380,6 +410,9 @@ protected: // DEVICE for Ginkgo } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; CusparseCsrEx(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) @@ -422,6 +455,12 @@ class CusparseHybrid public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -471,6 +510,10 @@ protected: &scalars.get_const_data()[1], dx); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + CusparseHybrid(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), @@ -551,10 +594,16 @@ class CusparseGenericCsr public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; cusparseIndexType_t cu_index = gko::kernels::cuda::cusparse_index_type(); cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } + void read(const mat_data& data) override { using gko::kernels::cuda::as_culibs_type; @@ -598,6 +647,10 @@ protected: Alg); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + CusparseGenericCsr(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), @@ -629,10 +682,16 @@ class CusparseGenericCoo public: using coo = gko::matrix::Coo; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; cusparseIndexType_t cu_index = gko::kernels::cuda::cusparse_index_type(); cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } + void read(const mat_data& data) override { using gko::kernels::cuda::as_culibs_type; @@ -676,6 +735,10 @@ protected: CUSPARSE_MV_ALG_DEFAULT); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + CusparseGenericCoo(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index 34bb4e3f164..866e321c7a8 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -65,12 +65,6 @@ class HipsparseBase : public gko::LinOp { const gko::HipExecutor* get_gpu_exec() const { return gpu_exec_.get(); } protected: - void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*, - gko::LinOp*) const override - { - GKO_NOT_IMPLEMENTED; - } - HipsparseBase(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::LinOp(exec, size) @@ -130,6 +124,12 @@ class HipsparseCsr public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -161,6 +161,10 @@ class HipsparseCsr &scalars.get_const_data()[1], dx); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + HipsparseCsr(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), @@ -191,6 +195,12 @@ class HipsparseCsrmm public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -223,6 +233,10 @@ class HipsparseCsrmm dense_x->get_size()[0]); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + HipsparseCsrmm(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), @@ -257,6 +271,12 @@ class HipsparseHybrid public: using csr = gko::matrix::Csr; using mat_data = gko::matrix_data; + using device_mat_data = gko::device_matrix_data; + + void read(const device_mat_data& data) override + { + this->read(data.copy_to_host()); + } void read(const mat_data& data) override { @@ -306,6 +326,10 @@ class HipsparseHybrid &scalars.get_const_data()[1], dx); } + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, + gko::LinOp* x) const override GKO_NOT_IMPLEMENTED; + HipsparseHybrid(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), From 0462d4fb9a648511016f00bdaa8efef13fe741f5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 15:42:32 +0100 Subject: [PATCH 13/32] unify Dense/Csr::convert_to(...) --- common/cuda_hip/matrix/coo_kernels.hpp.inc | 55 -- common/cuda_hip/matrix/csr_kernels.hpp.inc | 260 --------- common/cuda_hip/matrix/dense_kernels.hpp.inc | 434 +++++--------- common/cuda_hip/matrix/ell_kernels.hpp.inc | 105 ---- common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 44 ++ common/cuda_hip/matrix/hybrid_kernels.hpp.inc | 113 +--- common/cuda_hip/matrix/sellp_kernels.hpp.inc | 147 ----- common/unified/matrix/csr_kernels.cpp | 125 ++++ common/unified/matrix/dense_kernels.cpp | 76 +++ common/unified/matrix/ell_kernels.cpp | 26 + common/unified/matrix/hybrid_kernels.cpp | 16 + common/unified/matrix/sellp_kernels.cpp | 68 ++- core/device_hooks/common_kernels.inc.cpp | 24 +- core/matrix/coo.cpp | 11 + core/matrix/csr.cpp | 122 ++-- core/matrix/csr_kernels.hpp | 125 ++-- core/matrix/dense.cpp | 314 +++++----- core/matrix/dense_kernels.hpp | 43 +- core/matrix/diagonal.cpp | 26 +- core/matrix/ell.cpp | 52 +- core/matrix/fbcsr.cpp | 31 +- core/matrix/fbcsr_kernels.hpp | 57 +- core/matrix/hybrid.cpp | 55 +- core/matrix/hybrid_kernels.hpp | 7 + core/matrix/sellp.cpp | 30 +- core/matrix/sellp_kernels.hpp | 43 +- core/matrix/sparsity_csr.cpp | 43 ++ core/matrix/sparsity_csr_kernels.hpp | 7 + cuda/matrix/csr_kernels.cu | 238 +------- cuda/matrix/dense_kernels.cu | 230 ++------ cuda/matrix/ell_kernels.cu | 24 - cuda/matrix/fbcsr_kernels.cu | 54 +- cuda/matrix/hybrid_kernels.cu | 20 - cuda/matrix/sellp_kernels.cu | 103 ---- cuda/matrix/sparsity_csr_kernels.cu | 9 + cuda/multigrid/amgx_pgm_kernels.cu | 1 - cuda/test/matrix/coo_kernels.cpp | 4 +- cuda/test/matrix/csr_kernels.cpp | 62 +- cuda/test/matrix/dense_kernels.cpp | 62 +- cuda/test/matrix/ell_kernels.cpp | 4 +- cuda/test/matrix/fbcsr_kernels.cpp | 18 - cuda/test/matrix/hybrid_kernels.cpp | 2 +- cuda/test/matrix/sellp_kernels.cpp | 17 +- dpcpp/matrix/csr_kernels.dp.cpp | 535 ------------------ dpcpp/matrix/dense_kernels.dp.cpp | 174 +----- dpcpp/matrix/ell_kernels.dp.cpp | 125 ---- dpcpp/matrix/fbcsr_kernels.dp.cpp | 26 - dpcpp/matrix/hybrid_kernels.dp.cpp | 155 ----- dpcpp/matrix/sellp_kernels.dp.cpp | 210 ------- dpcpp/matrix/sparsity_csr_kernels.dp.cpp | 9 + dpcpp/test/matrix/coo_kernels.cpp | 4 +- dpcpp/test/matrix/csr_kernels.cpp | 64 +-- dpcpp/test/matrix/dense_kernels.cpp | 62 +- dpcpp/test/matrix/ell_kernels.cpp | 35 +- dpcpp/test/matrix/hybrid_kernels.cpp | 17 +- dpcpp/test/matrix/sellp_kernels.cpp | 17 +- hip/matrix/csr_kernels.hip.cpp | 254 --------- hip/matrix/dense_kernels.hip.cpp | 235 ++------ hip/matrix/ell_kernels.hip.cpp | 25 - hip/matrix/fbcsr_kernels.hip.cpp | 32 +- hip/matrix/hybrid_kernels.hip.cpp | 20 - hip/matrix/sellp_kernels.hip.cpp | 103 ---- hip/matrix/sparsity_csr_kernels.hip.cpp | 9 + hip/test/matrix/coo_kernels.hip.cpp | 4 +- hip/test/matrix/csr_kernels.hip.cpp | 62 +- hip/test/matrix/dense_kernels.hip.cpp | 61 +- hip/test/matrix/ell_kernels.hip.cpp | 4 +- hip/test/matrix/hybrid_kernels.hip.cpp | 4 +- hip/test/matrix/sellp_kernels.hip.cpp | 17 +- include/ginkgo/core/matrix/coo.hpp | 8 +- include/ginkgo/core/matrix/csr.hpp | 6 +- include/ginkgo/core/matrix/dense.hpp | 18 + include/ginkgo/core/matrix/ell.hpp | 13 +- include/ginkgo/core/matrix/hybrid.hpp | 2 + include/ginkgo/core/matrix/sparsity_csr.hpp | 16 + omp/matrix/csr_kernels.cpp | 136 ----- omp/matrix/dense_kernels.cpp | 271 ++------- omp/matrix/ell_kernels.cpp | 26 - omp/matrix/fbcsr_kernels.cpp | 47 -- omp/matrix/sellp_kernels.cpp | 59 -- omp/matrix/sparsity_csr_kernels.cpp | 23 + omp/test/matrix/coo_kernels.cpp | 2 +- omp/test/matrix/csr_kernels.cpp | 24 +- omp/test/matrix/dense_kernels.cpp | 102 ++-- omp/test/matrix/fbcsr_kernels.cpp | 32 -- omp/test/matrix/hybrid_kernels.cpp | 2 +- omp/test/matrix/sellp_kernels.cpp | 2 +- reference/matrix/csr_kernels.cpp | 98 +--- reference/matrix/dense_kernels.cpp | 203 +++---- reference/matrix/ell_kernels.cpp | 5 +- reference/matrix/fbcsr_kernels.cpp | 44 -- reference/matrix/hybrid_kernels.cpp | 18 +- reference/matrix/sellp_kernels.cpp | 72 +-- reference/matrix/sparsity_csr_kernels.cpp | 21 + reference/test/matrix/csr_kernels.cpp | 17 +- reference/test/matrix/dense_kernels.cpp | 40 +- reference/test/matrix/fbcsr_kernels.cpp | 33 -- reference/test/matrix/sellp_kernels.cpp | 11 - 98 files changed, 1664 insertions(+), 5257 deletions(-) diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.hpp.inc index 89fcc7c9942..a13068c1ce0 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.hpp.inc @@ -218,58 +218,3 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmm( } // namespace - - -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs( - const IndexType* __restrict__ idxs, size_type num_nonzeros, - IndexType* __restrict__ ptrs, size_type length) -{ - const auto tidx = thread::get_thread_id_flat(); - - if (tidx == 0) { - ptrs[0] = 0; - ptrs[length - 1] = num_nonzeros; - } - - if (0 < tidx && tidx < num_nonzeros) { - if (idxs[tidx - 1] < idxs[tidx]) { - for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) { - ptrs[i] = tidx; - } - } - } -} - - -template -__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType* __restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_dense( - size_type nnz, const IndexType* __restrict__ row_idxs, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, size_type stride, - ValueType* __restrict__ result) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < nnz) { - result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx]; - } -} - - -} // namespace kernel diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 104e66ae114..5f14ee9ea0c 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -585,33 +585,6 @@ __global__ __launch_bounds__(default_block_size) void spgeam( } -template -__global__ __launch_bounds__(default_block_size) void convert_row_ptrs_to_idxs( - size_type num_rows, const IndexType* __restrict__ ptrs, - IndexType* __restrict__ idxs) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_rows) { - for (auto i = ptrs[tidx]; i < ptrs[tidx + 1]; i++) { - idxs[i] = tidx; - } - } -} - - -template -__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType* __restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - template __global__ __launch_bounds__(default_block_size) void fill_in_dense( size_type num_rows, const IndexType* __restrict__ row_ptrs, @@ -628,239 +601,6 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( } -template -__global__ __launch_bounds__(default_block_size) void calculate_nnz_per_row( - size_type num_rows, const IndexType* __restrict__ row_ptrs, - size_type* __restrict__ nnz_per_row) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_rows) { - nnz_per_row[tidx] = row_ptrs[tidx + 1] - row_ptrs[tidx]; - } -} - - -__global__ __launch_bounds__(config::warp_size) void calculate_slice_lengths( - size_type num_rows, size_type slice_size, size_type stride_factor, - const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ slice_lengths, size_type* __restrict__ slice_sets) -{ - constexpr auto warp_size = config::warp_size; - const auto sliceid = blockIdx.x; - const auto tid_in_warp = threadIdx.x; - - if (sliceid * slice_size + tid_in_warp < num_rows) { - size_type thread_result = 0; - for (int i = tid_in_warp; i < slice_size; i += warp_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - auto warp_result = reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0) { - auto slice_length = - ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_sellp( - size_type num_rows, size_type slice_size, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - size_type* __restrict__ slice_lengths, size_type* __restrict__ slice_sets, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values) -{ - const auto global_row = thread::get_thread_id_flat(); - const auto row = global_row % slice_size; - const auto sliceid = global_row / slice_size; - - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[sliceid] * slice_size + row; - - for (size_type csr_ind = source_row_ptrs[global_row]; - csr_ind < source_row_ptrs[global_row + 1]; csr_ind++) { - result_values[sellp_ind] = source_values[csr_ind]; - result_col_idxs[sellp_ind] = source_col_idxs[csr_ind]; - sellp_ind += slice_size; - } - for (size_type i = sellp_ind; - i < - (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; - i += slice_size) { - result_col_idxs[i] = 0; - result_values[i] = zero(); - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void initialize_zero_ell( - size_type max_nnz_per_row, size_type stride, ValueType* __restrict__ values, - IndexType* __restrict__ col_idxs) -{ - const auto tidx = thread::get_thread_id_flat(); - - if (tidx < stride * max_nnz_per_row) { - values[tidx] = zero(); - col_idxs[tidx] = 0; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_ell( - size_type num_rows, size_type stride, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - ValueType* __restrict__ result_values, - IndexType* __restrict__ result_col_idxs) -{ - constexpr auto warp_size = config::warp_size; - const auto row = thread::get_subwarp_id_flat(); - const auto local_tidx = threadIdx.x % warp_size; - - if (row < num_rows) { - for (size_type i = local_tidx; - i < source_row_ptrs[row + 1] - source_row_ptrs[row]; - i += warp_size) { - const auto result_idx = row + stride * i; - const auto source_idx = i + source_row_ptrs[row]; - result_values[result_idx] = source_values[source_idx]; - result_col_idxs[result_idx] = source_col_idxs[source_idx]; - } - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice( - size_type num_rows, size_type slice_size, size_type stride_factor, - const size_type* __restrict__ nnz_per_row, size_type* __restrict__ result) -{ - constexpr auto warp_size = config::warp_size; - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - const auto warpid = thread::get_subwarp_id_flat(); - const auto tid_in_warp = warp_tile.thread_rank(); - const auto slice_num = ceildiv(num_rows, slice_size); - - size_type thread_result = 0; - for (auto i = tid_in_warp; i < slice_size; i += warp_size) { - if (warpid * slice_size + i < num_rows) { - thread_result = - max(thread_result, nnz_per_row[warpid * slice_size + i]); - } - } - auto warp_result = reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0 && warpid < slice_num) { - result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_total_cols( - size_type num_slices, const size_type* __restrict__ max_nnz_per_slice, - size_type* __restrict__ result) -{ - __shared__ size_type block_result[default_block_size]; - - reduce_array(num_slices, max_nnz_per_slice, block_result, - [](const size_type& x, const size_type& y) { return x + y; }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_result[0]; - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_max_nnz( - size_type size, const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ result) -{ - __shared__ size_type block_max[default_block_size]; - - reduce_array( - size, nnz_per_row, block_max, - [](const size_type& x, const size_type& y) { return max(x, y); }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_max[0]; - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void calculate_hybrid_coo_row_nnz( - size_type num_rows, size_type ell_max_nnz_per_row, - IndexType* __restrict__ csr_row_idxs, - size_type* __restrict__ coo_row_nnz) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_rows) { - const size_type csr_nnz = csr_row_idxs[tidx + 1] - csr_row_idxs[tidx]; - coo_row_nnz[tidx] = - (csr_nnz > ell_max_nnz_per_row) * (csr_nnz - ell_max_nnz_per_row); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_hybrid( - size_type num_rows, size_type stride, size_type ell_max_nnz_per_row, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - const size_type* __restrict__ coo_offset, - ValueType* __restrict__ result_ell_val, - IndexType* __restrict__ result_ell_col, - ValueType* __restrict__ result_coo_val, - IndexType* __restrict__ result_coo_col, - IndexType* __restrict__ result_coo_row) -{ - constexpr auto warp_size = config::warp_size; - const auto row = thread::get_subwarp_id_flat(); - const auto local_tidx = threadIdx.x % warp_size; - - if (row < num_rows) { - for (size_type i = local_tidx; - i < source_row_ptrs[row + 1] - source_row_ptrs[row]; - i += warp_size) { - const auto source_idx = i + source_row_ptrs[row]; - if (i < ell_max_nnz_per_row) { - const auto result_idx = row + stride * i; - result_ell_val[result_idx] = source_values[source_idx]; - result_ell_col[result_idx] = source_col_idxs[source_idx]; - } else { - const auto result_idx = - coo_offset[row] + i - ell_max_nnz_per_row; - result_coo_val[result_idx] = source_values[source_idx]; - result_coo_col[result_idx] = source_col_idxs[source_idx]; - result_coo_row[result_idx] = row; - } - } - } -} - - template __global__ __launch_bounds__(default_block_size) void check_unsorted( const IndexType* __restrict__ row_ptrs, diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index 6bc53b7f8ce..5376d690d5d 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -33,170 +33,34 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace kernel { -template -__device__ void compute_partial_reduce(size_type num_rows, - OutType* __restrict__ work, - CallableGetValue get_value, - CallableReduce reduce_op) -{ - constexpr auto warps_per_block = block_size / config::warp_size; - - const auto num_blocks = gridDim.x; - const auto local_id = thread::get_local_thread_id(); - const auto global_id = - thread::get_thread_id(); - - auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { - tmp = reduce_op(tmp, get_value(i)); - } - __shared__ UninitializedArray tmp_work; - tmp_work[local_id] = tmp; - - reduce(group::this_thread_block(), static_cast(tmp_work), - reduce_op); - - if (local_id == 0) { - work[thread::get_block_id()] = tmp_work[0]; - } -} - - -template -__device__ void finalize_reduce_computation(size_type size, - const ValueType* work, - ValueType* result, - CallableReduce reduce_op, - CallableFinalize finalize_op) -{ - const auto local_id = thread::get_local_thread_id(); - - ValueType tmp = zero(); - for (auto i = local_id; i < size; i += block_size) { - tmp = reduce_op(tmp, work[i]); - } - __shared__ UninitializedArray tmp_work; - tmp_work[local_id] = tmp; - - reduce(group::this_thread_block(), static_cast(tmp_work), - reduce_op); - - if (local_id == 0) { - *result = finalize_op(tmp_work[0]); - } -} - - -template -__global__ __launch_bounds__(block_size) void compute_partial_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }); -} - - -template -__global__ __launch_bounds__(block_size) void compute_partial_conj_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return conj(x[i * stride_x]) * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }); -} - - -template -__global__ __launch_bounds__(block_size) void finalize_sum_reduce_computation( - size_type size, const ValueType* work, ValueType* result) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return x; }); -} - - -template -__global__ __launch_bounds__(block_size) void compute_partial_norm2( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - remove_complex* __restrict__ work) -{ - using norm_type = remove_complex; - compute_partial_reduce( - num_rows, work, - [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, - [](const norm_type& x, const norm_type& y) { return x + y; }); -} - - -template -__global__ __launch_bounds__(block_size) void finalize_sqrt_reduce_computation( - size_type size, const ValueType* work, ValueType* result) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return sqrt(x); }); -} - - template __global__ __launch_bounds__(default_block_size) void fill_in_coo( size_type num_rows, size_type num_cols, size_type stride, - const size_type* __restrict__ row_ptrs, - const ValueType* __restrict__ source, IndexType* __restrict__ row_idxs, - IndexType* __restrict__ col_idxs, ValueType* __restrict__ values) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_rows) { - size_type write_to = row_ptrs[tidx]; - - for (size_type i = 0; i < num_cols; i++) { - if (source[stride * tidx + i] != zero()) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - row_idxs[write_to] = tidx; - write_to++; - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( - size_type num_rows, size_type num_cols, size_type stride, - const ValueType* __restrict__ work, IndexType* __restrict__ result) + const ValueType* __restrict__ source, const int64* __restrict__ row_ptrs, + IndexType* __restrict__ row_idxs, IndexType* __restrict__ col_idxs, + ValueType* __restrict__ values) { - constexpr auto warp_size = config::warp_size; - const auto row_idx = thread::get_subwarp_id_flat(); - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - - if (row_idx < num_rows) { - IndexType part_result{}; - for (auto i = warp_tile.thread_rank(); i < num_cols; i += warp_size) { - if (work[stride * row_idx + i] != zero()) { - part_result += 1; + const auto row = thread::get_subwarp_id_flat(); + + if (row < num_rows) { + auto warp = group::tiled_partition( + group::this_thread_block()); + auto lane_prefix_mask = + (config::lane_mask_type(1) << warp.thread_rank()) - 1; + auto base_out_idx = row_ptrs[row]; + for (size_type i = 0; i < num_cols; i += config::warp_size) { + const auto col = i + warp.thread_rank(); + const auto pred = + col < num_cols ? is_nonzero(source[stride * row + col]) : false; + const auto mask = warp.ballot(pred); + const auto out_idx = base_out_idx + popcnt(mask & lane_prefix_mask); + if (pred) { + values[out_idx] = source[stride * row + col]; + col_idxs[out_idx] = col; + row_idxs[out_idx] = row; } + base_out_idx += popcnt(mask); } - result[row_idx] = reduce( - warp_tile, part_result, - [](const size_type& a, const size_type& b) { return a + b; }); } } @@ -207,16 +71,25 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( const ValueType* __restrict__ source, IndexType* __restrict__ row_ptrs, IndexType* __restrict__ col_idxs, ValueType* __restrict__ values) { - const auto tidx = thread::get_thread_id_flat(); - - if (tidx < num_rows) { - auto write_to = row_ptrs[tidx]; - for (size_type i = 0; i < num_cols; i++) { - if (source[stride * tidx + i] != zero()) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - write_to++; + const auto row = thread::get_subwarp_id_flat(); + + if (row < num_rows) { + auto warp = group::tiled_partition( + group::this_thread_block()); + auto lane_prefix_mask = + (config::lane_mask_type(1) << warp.thread_rank()) - 1; + auto base_out_idx = row_ptrs[row]; + for (size_type i = 0; i < num_cols; i += config::warp_size) { + const auto col = i + warp.thread_rank(); + const auto pred = + col < num_cols ? is_nonzero(source[stride * row + col]) : false; + const auto mask = warp.ballot(pred); + const auto out_idx = base_out_idx + popcnt(mask & lane_prefix_mask); + if (pred) { + values[out_idx] = source[stride * row + col]; + col_idxs[out_idx] = col; } + base_out_idx += popcnt(mask); } } } @@ -226,62 +99,88 @@ template __global__ __launch_bounds__(default_block_size) void fill_in_ell( size_type num_rows, size_type num_cols, size_type source_stride, const ValueType* __restrict__ source, size_type max_nnz_per_row, - size_type result_stride, IndexType* __restrict__ col_ptrs, + size_type result_stride, IndexType* __restrict__ col_idxs, ValueType* __restrict__ values) { - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_rows) { - IndexType col_idx = 0; - for (size_type col = 0; col < num_cols; col++) { - if (source[tidx * source_stride + col] != zero()) { - col_ptrs[col_idx * result_stride + tidx] = col; - values[col_idx * result_stride + tidx] = - source[tidx * source_stride + col]; - col_idx++; + const auto row = thread::get_subwarp_id_flat(); + + if (row < num_rows) { + auto warp = group::tiled_partition( + group::this_thread_block()); + auto lane_prefix_mask = + (config::lane_mask_type(1) << warp.thread_rank()) - 1; + size_type base_out_idx{}; + for (size_type i = 0; i < num_cols; i += config::warp_size) { + const auto col = i + warp.thread_rank(); + const auto pred = + col < num_cols ? is_nonzero(source[source_stride * row + col]) + : false; + const auto mask = warp.ballot(pred); + const auto out_idx = + row + (base_out_idx + popcnt(mask & lane_prefix_mask)) * + result_stride; + if (pred) { + values[out_idx] = source[source_stride * row + col]; + col_idxs[out_idx] = col; } + base_out_idx += popcnt(mask); } - for (size_type j = col_idx; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); - } - } else if (tidx < result_stride) { - for (size_type j = 0; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); + for (size_type i = base_out_idx + warp.thread_rank(); + i < max_nnz_per_row; i += config::warp_size) { + const auto out_idx = row + i * result_stride; + values[out_idx] = zero(); + col_idxs[out_idx] = 0; } } } -__global__ __launch_bounds__(config::warp_size) void calculate_slice_lengths( - size_type num_rows, size_type slice_size, int slice_num, - size_type stride_factor, const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ slice_lengths, size_type* __restrict__ slice_sets) -{ - constexpr auto warp_size = config::warp_size; - const auto sliceid = blockIdx.x; - const auto tid_in_warp = threadIdx.x; - - if (sliceid * slice_size + tid_in_warp < num_rows) { - size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; +template +__global__ __launch_bounds__(default_block_size) void fill_in_hybrid( + size_type num_rows, size_type num_cols, size_type source_stride, + const ValueType* __restrict__ source, size_type ell_max_nnz_per_row, + size_type ell_stride, IndexType* __restrict__ ell_col_idxs, + ValueType* __restrict__ ell_values, const int64* __restrict__ coo_row_ptrs, + IndexType* __restrict__ coo_row_idxs, IndexType* __restrict__ coo_col_idxs, + ValueType* __restrict__ coo_values) +{ + const auto row = thread::get_subwarp_id_flat(); + + if (row < num_rows) { + auto warp = group::tiled_partition( + group::this_thread_block()); + auto lane_prefix_mask = + (config::lane_mask_type(1) << warp.thread_rank()) - 1; + size_type base_out_idx{}; + const auto coo_out_begin = coo_row_ptrs[row]; + for (size_type i = 0; i < num_cols; i += config::warp_size) { + const auto col = i + warp.thread_rank(); + const auto pred = + col < num_cols ? is_nonzero(source[source_stride * row + col]) + : false; + const auto mask = warp.ballot(pred); + const auto cur_out_idx = + base_out_idx + popcnt(mask & lane_prefix_mask); + if (pred) { + if (cur_out_idx < ell_max_nnz_per_row) { + const auto out_idx = row + cur_out_idx * ell_stride; + ell_values[out_idx] = source[source_stride * row + col]; + ell_col_idxs[out_idx] = col; + } else { + const auto out_idx = + cur_out_idx - ell_max_nnz_per_row + coo_out_begin; + coo_values[out_idx] = source[source_stride * row + col]; + coo_col_idxs[out_idx] = col; + coo_row_idxs[out_idx] = row; + } + } + base_out_idx += popcnt(mask); } - - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - auto warp_result = reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0) { - auto slice_length = - ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; + for (size_type i = base_out_idx + warp.thread_rank(); + i < ell_max_nnz_per_row; i += config::warp_size) { + const auto out_idx = row + i * ell_stride; + ell_values[out_idx] = zero(); + ell_col_idxs[out_idx] = 0; } } } @@ -292,91 +191,40 @@ __global__ __launch_bounds__(default_block_size) void fill_in_sellp( size_type num_rows, size_type num_cols, size_type slice_size, size_type stride, const ValueType* __restrict__ source, size_type* __restrict__ slice_lengths, size_type* __restrict__ slice_sets, - IndexType* __restrict__ col_idxs, ValueType* __restrict__ vals) + IndexType* __restrict__ col_idxs, ValueType* __restrict__ values) { - const auto global_row = thread::get_thread_id_flat(); - const auto row = global_row % slice_size; - const auto sliceid = global_row / slice_size; - - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[sliceid] * slice_size + row; - - for (size_type col = 0; col < num_cols; col++) { - auto val = source[global_row * stride + col]; - if (val != zero()) { - col_idxs[sellp_ind] = col; - vals[sellp_ind] = val; - sellp_ind += slice_size; + const auto row = thread::get_subwarp_id_flat(); + const auto local_row = row % slice_size; + const auto slice = row / slice_size; + + if (row < num_rows) { + const auto slice_length = slice_lengths[slice]; + auto warp = group::tiled_partition( + group::this_thread_block()); + auto lane_prefix_mask = + (config::lane_mask_type(1) << warp.thread_rank()) - 1; + auto base_out_idx = slice_sets[slice]; + for (size_type i = 0; i < num_cols; i += config::warp_size) { + const auto col = i + warp.thread_rank(); + const auto pred = + col < num_cols ? is_nonzero(source[stride * row + col]) : false; + const auto mask = warp.ballot(pred); + const auto out_idx = + local_row + + (base_out_idx + popcnt(mask & lane_prefix_mask)) * slice_size; + if (pred) { + values[out_idx] = source[stride * row + col]; + col_idxs[out_idx] = col; } + base_out_idx += popcnt(mask); } - for (size_type i = sellp_ind; - i < - (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; - i += slice_size) { - col_idxs[i] = 0; - vals[i] = zero(); - } - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_max_nnz( - size_type size, const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ result) -{ - extern __shared__ size_type block_max[]; - - reduce_array( - size, nnz_per_row, block_max, - [](const size_type& x, const size_type& y) { return max(x, y); }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_max[0]; - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice( - size_type num_rows, size_type slice_size, size_type stride_factor, - const size_type* __restrict__ nnz_per_row, size_type* __restrict__ result) -{ - constexpr auto warp_size = config::warp_size; - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - const auto warpid = thread::get_subwarp_id_flat(); - const auto tid_in_warp = warp_tile.thread_rank(); - const auto slice_num = ceildiv(num_rows, slice_size); - - size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { - if (warpid * slice_size + i < num_rows) { - thread_result = - max(thread_result, nnz_per_row[warpid * slice_size + i]); + for (size_type i = base_out_idx + warp.thread_rank(); i < slice_length; + i += config::warp_size) { + const auto out_idx = local_row + i * slice_size; + values[out_idx] = zero(); + col_idxs[out_idx] = 0; } } - - auto warp_result = reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0 && warpid < slice_num) { - result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; - } -} - - -__global__ __launch_bounds__(default_block_size) void reduce_total_cols( - size_type num_slices, const size_type* __restrict__ max_nnz_per_slice, - size_type* __restrict__ result) -{ - extern __shared__ size_type block_result[]; - - reduce_array(num_slices, max_nnz_per_slice, block_result, - [](const size_type& x, const size_type& y) { return x + y; }); - - if (threadIdx.x == 0) { - result[blockIdx.x] = block_result[0]; - } } diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc index c8f89bade69..4849555c8ef 100644 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/ell_kernels.hpp.inc @@ -31,7 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ namespace kernel { -namespace { template -__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType* __restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_dense( - size_type num_rows, size_type nnz, size_type source_stride, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, size_type result_stride, - ValueType* __restrict__ result) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_rows) { - for (size_type col = 0; col < nnz; col++) { - result[tidx * result_stride + - col_idxs[tidx + col * source_stride]] += - values[tidx + col * source_stride]; - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( - size_type num_rows, size_type max_nnz_per_row, size_type stride, - const ValueType* __restrict__ values, IndexType* __restrict__ result) -{ - constexpr auto warp_size = config::warp_size; - const auto row_idx = thread::get_subwarp_id_flat(); - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - - if (row_idx < num_rows) { - IndexType part_result{}; - for (auto i = warp_tile.thread_rank(); i < max_nnz_per_row; - i += warp_size) { - if (values[stride * i + row_idx] != zero()) { - part_result += 1; - } - } - result[row_idx] = reduce( - warp_tile, part_result, - [](const size_type& a, const size_type& b) { return a + b; }); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_csr( - size_type num_rows, size_type max_nnz_per_row, size_type stride, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_col_idxs, - IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values) -{ - const auto tidx = thread::get_thread_id_flat(); - - if (tidx < num_rows) { - auto write_to = result_row_ptrs[tidx]; - for (size_type i = 0; i < max_nnz_per_row; i++) { - const auto source_idx = tidx + stride * i; - if (source_values[source_idx] != zero()) { - result_values[write_to] = source_values[source_idx]; - result_col_idxs[write_to] = source_col_idxs[source_idx]; - write_to++; - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void extract_diagonal( - size_type diag_size, size_type max_nnz_per_row, size_type orig_stride, - const ValueType* __restrict__ orig_values, - const IndexType* __restrict__ orig_col_idxs, ValueType* __restrict__ diag) -{ - const auto tidx = thread::get_thread_id_flat(); - const auto row = tidx % diag_size; - const auto col = tidx / diag_size; - const auto ell_ind = orig_stride * col + row; - - if (col < max_nnz_per_row) { - if (orig_col_idxs[ell_ind] == row && - orig_values[ell_ind] != zero()) { - diag[row] = orig_values[ell_ind]; - } - } -} - - } // namespace kernel diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index afe6bdcc5a2..20846202b03 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -72,6 +72,50 @@ __global__ __launch_bounds__(default_block_size) void transpose_blocks( } +template +__global__ __launch_bounds__(default_block_size) void convert_to_csr( + const IndexType* block_row_ptrs, const IndexType* block_col_idxs, + const ValueType* blocks, IndexType* row_ptrs, IndexType* col_idxs, + ValueType* values, size_type num_block_rows, int block_size) +{ + const auto block_row = thread::get_subwarp_id_flat(); + if (block_row >= num_block_rows) { + return; + } + const auto block_begin = block_row_ptrs[block_row]; + const auto block_end = block_row_ptrs[block_row + 1]; + const auto num_blocks = block_end - block_begin; + const auto first_row = block_row * block_size; + const auto block_row_begin = block_begin * block_size * block_size; + const auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = warp.thread_rank(); + if (block_row == 0 && lane == 0) { + row_ptrs[0] = 0; + } + if (lane < block_size) { + row_ptrs[first_row + lane + 1] = + block_row_begin + num_blocks * block_size * (lane + 1); + } + for (IndexType i = lane; i < num_blocks * block_size * block_size; + i += config::warp_size) { + const auto local_row = i / (num_blocks * block_size); + const auto local_block = (i % (num_blocks * block_size)) / block_size; + const auto local_col = i % block_size; + const auto block_idx = block_col_idxs[block_begin + local_block]; + // first nz of the row block + all prev rows + all prev blocks in row + + // all previous cols in this block + const auto out_idx = block_row_begin + + num_blocks * block_size * local_row + + local_block * block_size + local_col; + col_idxs[out_idx] = block_idx * block_size + local_col; + values[out_idx] = + blocks[(block_begin + local_block) * block_size * block_size + + local_col * block_size + local_row]; + } +} + + } // namespace kernel diff --git a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc b/common/cuda_hip/matrix/hybrid_kernels.hpp.inc index e72ecac289a..04d52f6b0dc 100644 --- a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc +++ b/common/cuda_hip/matrix/hybrid_kernels.hpp.inc @@ -30,116 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -namespace kernel { - -/** - * The global function for counting the number of nonzeros per row of COO. - * It is almost like COO spmv routine. - * It performs is_nonzeros(Coo) times the vector whose values are one - * - * @param nnz the number of nonzeros in the matrix - * @param num_line the maximum round of each warp - * @param val the value array of the matrix - * @param row the row index array of the matrix - * @param nnz_per_row the output nonzeros per row - */ -template -__global__ __launch_bounds__(default_block_size) void count_coo_row_nnz( - const size_type nnz, const size_type num_lines, - const ValueType* __restrict__ val, const IndexType* __restrict__ row, - IndexType* __restrict__ nnz_per_row) -{ - IndexType temp_val = 0; - const auto start = static_cast(blockDim.x) * blockIdx.x * - blockDim.y * num_lines + - threadIdx.y * blockDim.x * num_lines; - size_type num = (nnz > start) * ceildiv(nnz - start, subwarp_size); - num = min(num, num_lines); - const IndexType ind_start = start + threadIdx.x; - const IndexType ind_end = ind_start + (num - 1) * subwarp_size; - IndexType ind = ind_start; - IndexType curr_row = (ind < nnz) ? row[ind] : 0; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - for (; ind < ind_end; ind += subwarp_size) { - temp_val += ind < nnz && val[ind] != zero(); - auto next_row = - (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1]; - // segmented scan - if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(nnz_per_row[curr_row]), temp_val); - } - temp_val = 0; - } - curr_row = next_row; - } - if (num > 0) { - ind = ind_end; - temp_val += ind < nnz && val[ind] != zero(); - // segmented scan - - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(nnz_per_row[curr_row]), temp_val); - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_csr( - size_type num_rows, size_type max_nnz_per_row, size_type stride, - const ValueType* __restrict__ ell_val, - const IndexType* __restrict__ ell_col, - const ValueType* __restrict__ coo_val, - const IndexType* __restrict__ coo_col, - const IndexType* __restrict__ coo_offset, - IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values) -{ - const auto tidx = thread::get_thread_id_flat(); - - if (tidx < num_rows) { - auto write_to = result_row_ptrs[tidx]; - for (size_type i = 0; i < max_nnz_per_row; i++) { - const auto source_idx = tidx + stride * i; - if (ell_val[source_idx] != zero()) { - result_values[write_to] = ell_val[source_idx]; - result_col_idxs[write_to] = ell_col[source_idx]; - write_to++; - } - } - for (auto i = coo_offset[tidx]; i < coo_offset[tidx + 1]; i++) { - if (coo_val[i] != zero()) { - result_values[write_to] = coo_val[i]; - result_col_idxs[write_to] = coo_col[i]; - write_to++; - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void add( - size_type num, ValueType1* __restrict__ val1, - const ValueType2* __restrict__ val2) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num) { - val1[tidx] += val2[tidx]; - } -} - - -} // namespace kernel +namespace { template @@ -153,6 +45,9 @@ struct hybrid_tuple_unpack_functor { }; +} // anonymous namespace + + template void split_matrix_data( std::shared_ptr exec, diff --git a/common/cuda_hip/matrix/sellp_kernels.hpp.inc b/common/cuda_hip/matrix/sellp_kernels.hpp.inc index 672806ffe8e..49076960a32 100644 --- a/common/cuda_hip/matrix/sellp_kernels.hpp.inc +++ b/common/cuda_hip/matrix/sellp_kernels.hpp.inc @@ -30,8 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -namespace { - template __global__ __launch_bounds__(matrix::default_slice_size) void spmv_kernel( @@ -86,148 +84,3 @@ __global__ beta[0] * c[global_row * c_stride + column_id] + val; } } - - -} // namespace - - -namespace kernel { - - -template -__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense( - size_type num_rows, size_type num_cols, size_type stride, - ValueType* __restrict__ result) -{ - const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x; - const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y; - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_dense( - size_type num_rows, size_type num_cols, size_type stride, - size_type slice_size, const size_type* __restrict__ slice_lengths, - const size_type* __restrict__ slice_sets, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, ValueType* __restrict__ result) -{ - const auto global_row = thread::get_subwarp_id_flat(); - const auto row = global_row % slice_size; - const auto slice = global_row / slice_size; - const auto start_index = threadIdx.x % threads_per_row; - - if (global_row < num_rows) { - for (auto i = start_index; i < slice_lengths[slice]; - i += threads_per_row) { - if (values[(slice_sets[slice] + i) * slice_size + row] != - zero()) { - result[global_row * stride + - col_idxs[(slice_sets[slice] + i) * slice_size + row]] = - values[(slice_sets[slice] + i) * slice_size + row]; - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void count_nnz_per_row( - size_type num_rows, size_type slice_size, - const size_type* __restrict__ slice_sets, - const ValueType* __restrict__ values, IndexType* __restrict__ result) -{ - constexpr auto warp_size = config::warp_size; - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - const auto row_idx = thread::get_subwarp_id_flat(); - const auto slice_id = row_idx / slice_size; - const auto tid_in_warp = warp_tile.thread_rank(); - const auto row_in_slice = row_idx % slice_size; - - if (row_idx < num_rows) { - IndexType part_result{}; - for (size_type sellp_ind = - (slice_sets[slice_id] + tid_in_warp) * slice_size + - row_in_slice; - sellp_ind < slice_sets[slice_id + 1] * slice_size; - sellp_ind += warp_size * slice_size) { - if (values[sellp_ind] != zero()) { - part_result += 1; - } - } - result[row_idx] = reduce( - warp_tile, part_result, - [](const size_type& a, const size_type& b) { return a + b; }); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void fill_in_csr( - size_type num_rows, size_type slice_size, - const size_type* __restrict__ source_slice_sets, - const IndexType* __restrict__ source_col_idxs, - const ValueType* __restrict__ source_values, - IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values) -{ - const auto row = thread::get_thread_id_flat(); - const auto slice_id = row / slice_size; - const auto row_in_slice = row % slice_size; - - if (row < num_rows) { - size_type csr_ind = result_row_ptrs[row]; - for (size_type sellp_ind = - source_slice_sets[slice_id] * slice_size + row_in_slice; - sellp_ind < source_slice_sets[slice_id + 1] * slice_size; - sellp_ind += slice_size) { - if (source_values[sellp_ind] != zero()) { - result_values[csr_ind] = source_values[sellp_ind]; - result_col_idxs[csr_ind] = source_col_idxs[sellp_ind]; - csr_ind++; - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void extract_diagonal( - size_type diag_size, size_type slice_size, - const size_type* __restrict__ orig_slice_sets, - const ValueType* __restrict__ orig_values, - const IndexType* __restrict__ orig_col_idxs, ValueType* __restrict__ diag) -{ - constexpr auto warp_size = config::warp_size; - auto warp_tile = - group::tiled_partition(group::this_thread_block()); - const auto slice_id = thread::get_subwarp_id_flat(); - const auto tid_in_warp = warp_tile.thread_rank(); - const auto slice_num = ceildiv(diag_size, slice_size); - - if (slice_id >= slice_num) { - return; - } - - const auto start_ind = orig_slice_sets[slice_id] * slice_size + tid_in_warp; - const auto end_ind = orig_slice_sets[slice_id + 1] * slice_size; - - for (auto sellp_ind = start_ind; sellp_ind < end_ind; - sellp_ind += warp_size) { - auto global_row = slice_id * slice_size + sellp_ind % slice_size; - if (global_row < diag_size) { - if (orig_col_idxs[sellp_ind] == global_row && - orig_values[sellp_ind] != zero()) { - diag[global_row] = orig_values[sellp_ind]; - } - } - } -} - - -} // namespace kernel diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index 65da87cf018..5b571dea4ae 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -152,6 +152,131 @@ void inv_scale(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL); +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Csr* source, + size_type* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto nnz) { + nnz[row] = row_ptrs[row + 1] - row_ptrs[row]; + }, + source->get_size()[0], source->get_const_row_ptrs(), result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL); + + +template +void convert_to_sellp(std::shared_ptr exec, + const matrix::Csr* matrix, + matrix::Sellp* output) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto in_cols, auto in_values, auto row_ptrs, + auto slice_size, auto slice_sets, auto cols, + auto values) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const auto slice = row / slice_size; + const auto local_row = row % slice_size; + const auto slice_begin = slice_sets[slice]; + const auto slice_end = slice_sets[slice + 1]; + const auto slice_length = slice_end - slice_begin; + auto out_idx = slice_begin * slice_size + local_row; + for (auto i = row_begin; i < row_begin + slice_length; i++) { + cols[out_idx] = i < row_end ? in_cols[i] : 0; + values[out_idx] = i < row_end ? unpack_member(in_values[i]) + : zero(values[out_idx]); + out_idx += slice_size; + } + }, + output->get_size()[0], matrix->get_const_col_idxs(), + matrix->get_const_values(), matrix->get_const_row_ptrs(), + output->get_slice_size(), output->get_const_slice_sets(), + output->get_col_idxs(), output->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); + + +template +void convert_to_ell(std::shared_ptr exec, + const matrix::Csr* matrix, + matrix::Ell* output) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto in_cols, auto in_values, auto row_ptrs, + auto ell_length, auto ell_stride, auto cols, + auto values) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + auto out_idx = row; + for (auto i = row_begin; i < row_begin + ell_length; i++) { + cols[out_idx] = i < row_end ? in_cols[i] : 0; + values[out_idx] = i < row_end ? unpack_member(in_values[i]) + : zero(values[out_idx]); + out_idx += ell_stride; + } + }, + output->get_size()[0], matrix->get_const_col_idxs(), + matrix->get_const_values(), matrix->get_const_row_ptrs(), + output->get_num_stored_elements_per_row(), output->get_stride(), + output->get_col_idxs(), output->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); + + +template +void convert_to_hybrid(std::shared_ptr exec, + const matrix::Csr* source, + const int64* coo_row_ptrs, + matrix::Hybrid* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto cols, auto vals, + auto ell_stride, auto ell_max_nnz, auto ell_cols, + auto ell_vals, auto coo_row_ptrs, auto coo_row_idxs, + auto coo_col_idxs, auto coo_vals) { + const auto row_begin = row_ptrs[row]; + const auto row_size = row_ptrs[row + 1] - row_begin; + for (int64 i = 0; i < ell_max_nnz; i++) { + const auto out_idx = row + ell_stride * i; + const auto in_idx = i + row_begin; + const bool use = i < row_size; + ell_cols[out_idx] = use ? cols[in_idx] : 0; + ell_vals[out_idx] = use ? vals[in_idx] : zero(vals[in_idx]); + } + const auto coo_begin = coo_row_ptrs[row]; + for (int64 i = ell_max_nnz; i < row_size; i++) { + const auto in_idx = i + row_begin; + const auto out_idx = + coo_begin + i - static_cast(ell_max_nnz); + coo_row_idxs[out_idx] = row; + coo_col_idxs[out_idx] = cols[in_idx]; + coo_vals[out_idx] = vals[in_idx]; + } + }, + source->get_size()[0], source->get_const_row_ptrs(), + source->get_const_col_idxs(), source->get_const_values(), + result->get_ell_stride(), result->get_ell_num_stored_elements_per_row(), + result->get_ell_col_idxs(), result->get_ell_values(), coo_row_ptrs, + result->get_coo_row_idxs(), result->get_coo_col_idxs(), + result->get_coo_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); + + } // namespace csr } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index ef1690c4af9..eec8577ef7a 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -308,6 +309,81 @@ void compute_norm1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +template +void compute_max_nnz_per_row(std::shared_ptr exec, + const matrix::Dense* source, + size_type& result) +{ + Array partial{exec, source->get_size()[0] + 1}; + count_nonzeros_per_row(exec, source, partial.get_data()); + run_kernel_reduction( + exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, + [] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; }, + [] GKO_KERNEL(auto a) { return a; }, size_type{}, + partial.get_data() + source->get_size()[0], source->get_size()[0], + partial); + result = exec->copy_val_to_host(partial.get_const_data() + + source->get_size()[0]); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); + + +template +void compute_slice_sets(std::shared_ptr exec, + const matrix::Dense* source, + size_type slice_size, size_type stride_factor, + size_type* slice_sets, size_type* slice_lengths) +{ + const auto num_rows = source->get_size()[0]; + Array row_nnz{exec, num_rows}; + count_nonzeros_per_row(exec, source, row_nnz.get_data()); + const auto num_slices = + static_cast(ceildiv(num_rows, slice_size)); + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto slice, auto local_row, auto row_nnz, auto slice_size, + auto stride_factor, auto num_rows) { + const auto row = slice * slice_size + local_row; + return row < num_rows ? static_cast( + ceildiv(row_nnz[row], stride_factor) * + stride_factor) + : size_type{}; + }, + [] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; }, + [] GKO_KERNEL(auto a) { return a; }, size_type{}, slice_lengths, 1, + gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, + num_rows); + exec->copy(num_slices, slice_lengths, slice_sets); + components::prefix_sum(exec, slice_sets, num_slices + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); + + +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Dense* mtx, + IndexType* result) +{ + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto mtx) { + return is_nonzero(mtx(i, j)) ? 1 : 0; + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, IndexType{}, result, 1, + mtx->get_size(), mtx); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); + + template void symm_permute(std::shared_ptr exec, const Array* permutation_indices, diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp index 6945fcc4d49..1c0692f53b0 100644 --- a/common/unified/matrix/ell_kernels.cpp +++ b/common/unified/matrix/ell_kernels.cpp @@ -180,6 +180,32 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_COUNT_NONZEROS_PER_ROW_KERNEL); +template +void extract_diagonal(std::shared_ptr exec, + const matrix::Ell* orig, + matrix::Diagonal* diag) +{ + // ELL is stored in column-major, so we swap row and column parameters + run_kernel( + exec, + [] GKO_KERNEL(auto ell_col, auto row, auto ell_stride, auto in_cols, + auto in_vals, auto out_vals) { + const auto ell_idx = ell_col * ell_stride + row; + const auto col = in_cols[ell_idx]; + const auto val = in_vals[ell_idx]; + if (row == col && is_nonzero(val)) { + out_vals[row] = val; + } + }, + dim<2>{orig->get_num_stored_elements_per_row(), orig->get_size()[0]}, + static_cast(orig->get_stride()), orig->get_const_col_idxs(), + orig->get_const_values(), diag->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); + + } // namespace ell } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp index 7bc27b04b6b..8f654e7d157 100644 --- a/common/unified/matrix/hybrid_kernels.cpp +++ b/common/unified/matrix/hybrid_kernels.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -47,6 +48,21 @@ namespace GKO_DEVICE_NAMESPACE { namespace hybrid { +void compute_coo_row_ptrs(std::shared_ptr exec, + const Array& row_nnz, size_type ell_lim, + int64* coo_row_ptrs) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto row_nnz, auto ell_lim, auto coo_row_ptrs) { + coo_row_ptrs[i] = max(int64{}, static_cast(row_nnz[i]) - + static_cast(ell_lim)); + }, + row_nnz.get_num_elems(), row_nnz, ell_lim, coo_row_ptrs); + components::prefix_sum(exec, coo_row_ptrs, row_nnz.get_num_elems() + 1); +} + + void compute_row_nnz(std::shared_ptr exec, const Array& row_ptrs, size_type* row_nnzs) { diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp index 4c6419119b3..1347a746a6c 100644 --- a/common/unified/matrix/sellp_kernels.cpp +++ b/common/unified/matrix/sellp_kernels.cpp @@ -52,8 +52,9 @@ namespace GKO_DEVICE_NAMESPACE { namespace sellp { +template void compute_slice_sets(std::shared_ptr exec, - const Array& row_ptrs, size_type slice_size, + const Array& row_ptrs, size_type slice_size, size_type stride_factor, size_type* slice_sets, size_type* slice_lengths) { @@ -80,6 +81,9 @@ void compute_slice_sets(std::shared_ptr exec, components::prefix_sum(exec, slice_sets, num_slices + 1); } +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL); + template void fill_in_matrix_data( @@ -144,6 +148,36 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Sellp* source, + IndexType* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto slice_size, auto slice_sets, auto values, + auto result) { + const auto slice = row / slice_size; + const auto local_row = row % slice_size; + const auto slice_begin = slice_sets[slice]; + const auto slice_end = slice_sets[slice + 1]; + const auto slice_length = slice_end - slice_begin; + auto in_idx = slice_begin * slice_size + local_row; + IndexType row_nnz{}; + for (int64 i = 0; i < slice_length; i++) { + row_nnz += is_nonzero(values[in_idx]); + in_idx += slice_size; + } + result[row] = row_nnz; + }, + source->get_size()[0], source->get_slice_size(), + source->get_const_slice_sets(), source->get_const_values(), result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL); + + template void convert_to_csr(std::shared_ptr exec, const matrix::Sellp* source, @@ -178,6 +212,38 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); +template +void extract_diagonal(std::shared_ptr exec, + const matrix::Sellp* orig, + matrix::Diagonal* diag) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto slice_size, auto slice_sets, auto cols, + auto values, auto diag) { + const auto slice = row / slice_size; + const auto local_row = row % slice_size; + const auto slice_begin = slice_sets[slice]; + const auto slice_end = slice_sets[slice + 1]; + const auto slice_length = slice_end - slice_begin; + auto in_idx = slice_begin * slice_size + local_row; + for (int64 i = 0; i < slice_length; i++) { + if (row == cols[in_idx]) { + diag[row] = values[in_idx]; + break; + } + in_idx += slice_size; + } + }, + orig->get_size()[0], orig->get_slice_size(), + orig->get_const_slice_sets(), orig->get_const_col_idxs(), + orig->get_const_values(), diag->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); + + } // namespace sellp } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 358b5f67b1c..9a072fa56d7 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -263,10 +263,11 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); @@ -443,6 +444,7 @@ namespace sparsity_csr { GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE( @@ -471,7 +473,6 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); @@ -479,9 +480,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); @@ -512,10 +511,6 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); @@ -580,6 +575,7 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); namespace hybrid { +GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL); GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); @@ -593,11 +589,11 @@ namespace sellp { GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL); -GKO_STUB(GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 9c7a56bb83c..4163be966c2 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -195,6 +196,16 @@ void Coo::move_to(Dense* result) } +template +void Coo::resize(dim<2> new_size, size_type nnz) +{ + this->set_size(new_size); + this->row_idxs_.resize_and_reset(nnz); + this->col_idxs_.resize_and_reset(nnz); + this->values_.resize_and_reset(nnz); +} + + template void Coo::read(const mat_data& data) { diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 637a8b6a43d..794a0404c57 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -53,6 +53,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_kernels.hpp" +#include "core/matrix/ell_kernels.hpp" +#include "core/matrix/hybrid_kernels.hpp" +#include "core/matrix/sellp_kernels.hpp" namespace gko { @@ -70,9 +73,12 @@ GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(fill_in_matrix_data, csr::fill_in_matrix_data); GKO_REGISTER_OPERATION(convert_ptrs_to_idxs, components::convert_ptrs_to_idxs); GKO_REGISTER_OPERATION(fill_in_dense, csr::fill_in_dense); +GKO_REGISTER_OPERATION(compute_slice_sets, sellp::compute_slice_sets); GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp); -GKO_REGISTER_OPERATION(calculate_total_cols, csr::calculate_total_cols); +GKO_REGISTER_OPERATION(compute_max_row_nnz, ell::compute_max_row_nnz); GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell); +GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, + hybrid::compute_coo_row_ptrs); GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid); GKO_REGISTER_OPERATION(calculate_nonzeros_per_row_in_span, csr::calculate_nonzeros_per_row_in_span); @@ -84,10 +90,7 @@ GKO_REGISTER_OPERATION(row_permute, csr::row_permute); GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute); GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute); GKO_REGISTER_OPERATION(invert_permutation, csr::invert_permutation); -GKO_REGISTER_OPERATION(calculate_max_nnz_per_row, - csr::calculate_max_nnz_per_row); -GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, - csr::calculate_nonzeros_per_row); +GKO_REGISTER_OPERATION(count_nonzeros_per_row, csr::count_nonzeros_per_row); GKO_REGISTER_OPERATION(sort_by_column_index, csr::sort_by_column_index); GKO_REGISTER_OPERATION(is_sorted_by_column_index, csr::is_sorted_by_column_index); @@ -184,13 +187,13 @@ void Csr::convert_to( Coo* result) const { auto exec = this->get_executor(); - auto tmp = Coo::create( - exec, this->get_size(), this->get_num_stored_elements()); + auto tmp = make_temporary_clone(exec, result); tmp->values_ = this->values_; tmp->col_idxs_ = this->col_idxs_; + tmp->row_idxs_.resize_and_reset(this->get_num_stored_elements()); + tmp->set_size(this->get_size()); exec->run(csr::make_convert_ptrs_to_idxs( this->get_const_row_ptrs(), this->get_size()[0], tmp->get_row_idxs())); - tmp->move_to(result); } @@ -205,10 +208,10 @@ template void Csr::convert_to(Dense* result) const { auto exec = this->get_executor(); - auto tmp = Dense::create(exec, this->get_size()); - tmp->fill(zero()); - exec->run(csr::make_fill_in_dense(this, tmp.get())); - tmp->move_to(result); + result->resize(this->get_size()); + result->fill(zero()); + exec->run(csr::make_fill_in_dense( + this, make_temporary_clone(exec, result).get())); } @@ -224,21 +227,20 @@ void Csr::convert_to( Hybrid* result) const { auto exec = this->get_executor(); - Array row_nnz(exec, this->get_size()[0]); - exec->run(csr::make_calculate_nonzeros_per_row(this, &row_nnz)); - size_type ell_lim = zero(); - size_type coo_lim = zero(); - result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_lim); - const auto max_nnz_per_row = - std::max(result->get_ell_num_stored_elements_per_row(), ell_lim); - const auto stride = std::max(result->get_ell_stride(), this->get_size()[0]); - const auto coo_nnz = - std::max(result->get_coo_num_stored_elements(), coo_lim); - auto tmp = Hybrid::create( - exec, this->get_size(), max_nnz_per_row, stride, coo_nnz, - result->get_strategy()); - exec->run(csr::make_convert_to_hybrid(this, tmp.get())); - tmp->move_to(result); + Array row_nnz{exec, this->get_size()[0]}; + Array coo_row_ptrs{exec, this->get_size()[0] + 1}; + exec->run(csr::make_count_nonzeros_per_row(this, row_nnz.get_data())); + size_type ell_lim{}; + size_type coo_nnz{}; + result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz); + exec->run(csr::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim, + coo_row_ptrs.get_data())); + auto tmp = make_temporary_clone(exec, result); + tmp->ell_->resize(this->get_size(), ell_lim); + tmp->coo_->resize(this->get_size(), coo_nnz); + tmp->set_size(this->get_size()); + exec->run(csr::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(), + tmp.get())); } @@ -260,13 +262,22 @@ void Csr::convert_to( const auto slice_size = (result->get_slice_size() == 0) ? default_slice_size : result->get_slice_size(); - size_type total_cols = 0; - exec->run(csr::make_calculate_total_cols(this, &total_cols, stride_factor, - slice_size)); - auto tmp = Sellp::create( - exec, this->get_size(), slice_size, stride_factor, total_cols); + const auto num_rows = this->get_size()[0]; + const auto num_slices = ceildiv(num_rows, slice_size); + auto tmp = make_temporary_clone(exec, result); + tmp->slice_sets_.resize_and_reset(num_slices + 1); + tmp->slice_lengths_.resize_and_reset(num_slices); + tmp->stride_factor_ = stride_factor; + tmp->slice_size_ = slice_size; + exec->run(csr::make_compute_slice_sets(this->row_ptrs_, slice_size, + stride_factor, tmp->get_slice_sets(), + tmp->get_slice_lengths())); + auto total_cols = + exec->copy_val_to_host(tmp->get_slice_sets() + num_slices); + tmp->col_idxs_.resize_and_reset(total_cols * slice_size); + tmp->values_.resize_and_reset(total_cols * slice_size); + tmp->set_size(this->get_size()); exec->run(csr::make_convert_to_sellp(this, tmp.get())); - tmp->move_to(result); } @@ -281,17 +292,13 @@ template void Csr::convert_to( SparsityCsr* result) const { - auto exec = this->get_executor(); - auto tmp = SparsityCsr::create( - exec, this->get_size(), this->get_num_stored_elements()); - tmp->col_idxs_ = this->col_idxs_; - tmp->row_ptrs_ = this->row_ptrs_; - if (result->value_.get_data()) { - tmp->value_ = result->value_; - } else { - tmp->value_ = gko::Array(exec, {one()}); + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + if (!result->value_.get_data()) { + result->value_ = + gko::Array(result->get_executor(), {one()}); } - tmp->move_to(result); + result->set_size(this->get_size()); } @@ -308,12 +315,19 @@ void Csr::convert_to( Ell* result) const { auto exec = this->get_executor(); - size_type max_nnz_per_row; - exec->run(csr::make_calculate_max_nnz_per_row(this, &max_nnz_per_row)); - auto tmp = Ell::create(exec, this->get_size(), - max_nnz_per_row); + size_type max_nnz_per_row{}; + exec->run(csr::make_compute_max_row_nnz(this->row_ptrs_, max_nnz_per_row)); + auto tmp = make_temporary_clone(exec, result); + if (tmp->get_size() != this->get_size() || + tmp->num_stored_elements_per_row_ != max_nnz_per_row) { + tmp->num_stored_elements_per_row_ = max_nnz_per_row; + tmp->stride_ = this->get_size()[0]; + const auto storage = tmp->num_stored_elements_per_row_ * tmp->stride_; + tmp->col_idxs_.resize_and_reset(storage); + tmp->values_.resize_and_reset(storage); + tmp->set_size(this->get_size()); + } exec->run(csr::make_convert_to_ell(this, tmp.get())); - tmp->move_to(result); } @@ -337,10 +351,10 @@ void Csr::read(const device_mat_data& data) { const auto nnz = data.nonzeros.get_num_elems(); auto exec = this->get_executor(); - this->set_size(data.size); this->row_ptrs_.resize_and_reset(data.size[0] + 1); this->col_idxs_.resize_and_reset(nnz); this->values_.resize_and_reset(nnz); + this->set_size(data.size); auto local_data = make_temporary_clone(exec, &data.nonzeros); exec->run(csr::make_build_row_ptrs(*local_data, data.size[0], this->get_row_ptrs())); @@ -368,16 +382,6 @@ void Csr::write(mat_data& data) const } -template -void Csr::resize(gko::dim<2> new_size, size_type nnz) -{ - this->set_size(new_size); - this->row_ptrs_.resize_and_reset(new_size[0] + 1); - this->col_idxs_.resize_and_reset(nnz); - this->values_.resize_and_reset(nnz); -} - - template std::unique_ptr Csr::transpose() const { diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 85334922a6b..15bab04223a 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -110,6 +110,7 @@ namespace kernels { #define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType) \ void convert_to_hybrid(std::shared_ptr exec, \ const matrix::Csr* source, \ + const int64* coo_row_ptrs, \ matrix::Hybrid* result) #define GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType) \ @@ -117,12 +118,6 @@ namespace kernels { const matrix::Csr* source, \ matrix::Sellp* result) -#define GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL(ValueType, IndexType) \ - void calculate_total_cols(std::shared_ptr exec, \ - const matrix::Csr* source, \ - size_type* result, size_type stride_factor, \ - size_type slice_size) - #define GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType) \ void transpose(std::shared_ptr exec, \ const matrix::Csr* orig, \ @@ -163,18 +158,11 @@ namespace kernels { std::shared_ptr exec, size_type size, \ const IndexType* permutation_indices, IndexType* inv_permutation) -#define GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) \ - void calculate_max_nnz_per_row( \ - std::shared_ptr exec, \ +#define GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType) \ + void count_nonzeros_per_row( \ + std::shared_ptr exec, \ const matrix::Csr* source, size_type* result) -#define GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, \ - IndexType) \ - void calculate_nonzeros_per_row( \ - std::shared_ptr exec, \ - const matrix::Csr* source, \ - Array* result) - #define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType) \ void calculate_nonzeros_per_row_in_span( \ std::shared_ptr exec, \ @@ -211,60 +199,57 @@ namespace kernels { const matrix::Dense* alpha, \ matrix::Csr* to_scale) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_CSR_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType); \ - template \ - GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_EXTRACT_DIAGONAL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType); \ - template \ + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_CSR_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType); \ + template \ + GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_EXTRACT_DIAGONAL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index be09de48dd3..54319e63f57 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -53,7 +53,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/matrix/hybrid_kernels.hpp" namespace gko { @@ -76,12 +78,12 @@ GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot); GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot); GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2); GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1); -GKO_REGISTER_OPERATION(count_nonzeros, dense::count_nonzeros); -GKO_REGISTER_OPERATION(calculate_max_nnz_per_row, - dense::calculate_max_nnz_per_row); -GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, - dense::calculate_nonzeros_per_row); -GKO_REGISTER_OPERATION(calculate_total_cols, dense::calculate_total_cols); +GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row); +GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, + hybrid::compute_coo_row_ptrs); +GKO_REGISTER_OPERATION(count_nonzeros_per_row, dense::count_nonzeros_per_row); +GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum); +GKO_REGISTER_OPERATION(compute_slice_sets, dense::compute_slice_sets); GKO_REGISTER_OPERATION(transpose, dense::transpose); GKO_REGISTER_OPERATION(conj_transpose, dense::conj_transpose); GKO_REGISTER_OPERATION(symm_permute, dense::symm_permute); @@ -109,126 +111,6 @@ GKO_REGISTER_OPERATION(get_imag, dense::get_imag); } // namespace dense -namespace { - - -template -inline void conversion_helper(Coo* result, - MatrixType* source, const OperationType& op) -{ - auto exec = source->get_executor(); - - size_type num_stored_nonzeros = 0; - exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros)); - auto tmp = Coo::create(exec, source->get_size(), - num_stored_nonzeros); - exec->run(op(source, tmp.get())); - tmp->move_to(result); -} - - -template -inline void conversion_helper(Csr* result, - MatrixType* source, const OperationType& op) -{ - auto exec = source->get_executor(); - - size_type num_stored_nonzeros = 0; - exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros)); - auto tmp = Csr::create( - exec, source->get_size(), num_stored_nonzeros, result->get_strategy()); - exec->run(op(source, tmp.get())); - tmp->move_to(result); -} - - -template -inline void conversion_helper(Ell* result, - MatrixType* source, const OperationType& op) -{ - auto exec = source->get_executor(); - size_type num_stored_elements_per_row = 0; - exec->run(dense::make_calculate_max_nnz_per_row( - source, &num_stored_elements_per_row)); - const auto max_nnz_per_row = std::max( - result->get_num_stored_elements_per_row(), num_stored_elements_per_row); - const auto stride = std::max(result->get_stride(), source->get_size()[0]); - auto tmp = Ell::create(exec, source->get_size(), - max_nnz_per_row, stride); - exec->run(op(source, tmp.get())); - tmp->move_to(result); -} - - -template -inline void conversion_helper(Hybrid* result, - MatrixType* source, const OperationType& op) -{ - auto exec = source->get_executor(); - Array row_nnz(exec, source->get_size()[0]); - exec->run(dense::make_calculate_nonzeros_per_row(source, &row_nnz)); - size_type ell_lim = zero(); - size_type coo_lim = zero(); - result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_lim); - const auto max_nnz_per_row = - std::max(result->get_ell_num_stored_elements_per_row(), ell_lim); - const auto stride = - std::max(result->get_ell_stride(), source->get_size()[0]); - const auto coo_nnz = - std::max(result->get_coo_num_stored_elements(), coo_lim); - auto tmp = Hybrid::create( - exec, source->get_size(), max_nnz_per_row, stride, coo_nnz, - result->get_strategy()); - exec->run(op(source, tmp.get())); - tmp->move_to(result); -} - - -template -inline void conversion_helper(Sellp* result, - MatrixType* source, const OperationType& op) -{ - auto exec = source->get_executor(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? default_stride_factor - : result->get_stride_factor(); - const auto slice_size = (result->get_slice_size() == 0) - ? default_slice_size - : result->get_slice_size(); - size_type total_cols = 0; - exec->run(dense::make_calculate_total_cols(source, &total_cols, - stride_factor, slice_size)); - auto tmp = Sellp::create( - exec, source->get_size(), slice_size, stride_factor, total_cols); - exec->run(op(source, tmp.get())); - tmp->move_to(result); -} - - -template -inline void conversion_helper(SparsityCsr* result, - MatrixType* source, const OperationType& op) -{ - auto exec = source->get_executor(); - - size_type num_stored_nonzeros = 0; - exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros)); - auto tmp = SparsityCsr::create( - exec, source->get_size(), num_stored_nonzeros); - exec->run(op(source, tmp.get())); - tmp->move_to(result); -} - - -} // namespace - - template void Dense::apply_impl(const LinOp* b, LinOp* x) const { @@ -476,13 +358,29 @@ void Dense::move_to(Dense>* result) } +template +template +void Dense::convert_impl(Coo* result) const +{ + auto exec = this->get_executor(); + const auto num_rows = this->get_size()[0]; + + Array row_ptrs{exec, num_rows + 1}; + exec->run(dense::make_count_nonzeros_per_row(this, row_ptrs.get_data())); + exec->run(dense::make_prefix_sum(row_ptrs.get_data(), num_rows + 1)); + const auto nnz = + exec->copy_val_to_host(row_ptrs.get_const_data() + num_rows); + result->resize(this->get_size(), nnz); + exec->run( + dense::make_convert_to_coo(this, row_ptrs.get_const_data(), + make_temporary_clone(exec, result).get())); +} + + template void Dense::convert_to(Coo* result) const { - // const ref parameters, as make_* functions take parameters by ref - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_coo(in, out); - }); + this->convert_impl(result); } @@ -496,9 +394,7 @@ void Dense::move_to(Coo* result) template void Dense::convert_to(Coo* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_coo(in, out); - }); + this->convert_impl(result); } @@ -510,15 +406,35 @@ void Dense::move_to(Coo* result) template -void Dense::convert_to(Csr* result) const +template +void Dense::convert_impl(Csr* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_csr(in, out); - }); + { + auto exec = this->get_executor(); + const auto num_rows = this->get_size()[0]; + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(num_rows + 1); + exec->run( + dense::make_count_nonzeros_per_row(this, tmp->get_row_ptrs())); + exec->run(dense::make_prefix_sum(tmp->get_row_ptrs(), num_rows + 1)); + const auto nnz = + exec->copy_val_to_host(tmp->get_const_row_ptrs() + num_rows); + tmp->col_idxs_.resize_and_reset(nnz); + tmp->values_.resize_and_reset(nnz); + tmp->set_size(this->get_size()); + exec->run(dense::make_convert_to_csr(this, tmp.get())); + } result->make_srow(); } +template +void Dense::convert_to(Csr* result) const +{ + this->convert_impl(result); +} + + template void Dense::move_to(Csr* result) { @@ -529,10 +445,7 @@ void Dense::move_to(Csr* result) template void Dense::convert_to(Csr* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_csr(in, out); - }); - result->make_srow(); + this->convert_impl(result); } @@ -543,12 +456,24 @@ void Dense::move_to(Csr* result) } +template +template +void Dense::convert_impl(Ell* result) const +{ + auto exec = this->get_executor(); + size_type num_stored_elements_per_row{}; + exec->run( + dense::make_compute_max_nnz_per_row(this, num_stored_elements_per_row)); + result->resize(this->get_size(), num_stored_elements_per_row); + exec->run(dense::make_convert_to_ell( + this, make_temporary_clone(exec, result).get())); +} + + template void Dense::convert_to(Ell* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_ell(in, out); - }); + this->convert_impl(result); } @@ -562,9 +487,7 @@ void Dense::move_to(Ell* result) template void Dense::convert_to(Ell* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_ell(in, out); - }); + this->convert_impl(result); } @@ -575,12 +498,32 @@ void Dense::move_to(Ell* result) } +template +template +void Dense::convert_impl(Hybrid* result) const +{ + auto exec = this->get_executor(); + Array row_nnz{exec, this->get_size()[0]}; + Array coo_row_ptrs{exec, this->get_size()[0] + 1}; + exec->run(dense::make_count_nonzeros_per_row(this, row_nnz.get_data())); + size_type ell_lim{}; + size_type coo_nnz{}; + result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz); + exec->run(dense::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim, + coo_row_ptrs.get_data())); + auto tmp = make_temporary_clone(exec, result); + tmp->ell_->resize(this->get_size(), ell_lim); + tmp->coo_->resize(this->get_size(), coo_nnz); + tmp->set_size(this->get_size()); + exec->run(dense::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(), + tmp.get())); +} + + template void Dense::convert_to(Hybrid* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_hybrid(in, out); - }); + this->convert_impl(result); } @@ -594,9 +537,7 @@ void Dense::move_to(Hybrid* result) template void Dense::convert_to(Hybrid* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_hybrid(in, out); - }); + this->convert_impl(result); } @@ -607,12 +548,40 @@ void Dense::move_to(Hybrid* result) } +template +template +void Dense::convert_impl(Sellp* result) const +{ + auto exec = this->get_executor(); + const auto num_rows = this->get_size()[0]; + const auto stride_factor = (result->get_stride_factor() == 0) + ? default_stride_factor + : result->get_stride_factor(); + const auto slice_size = (result->get_slice_size() == 0) + ? default_slice_size + : result->get_slice_size(); + const auto num_slices = ceildiv(num_rows, slice_size); + auto tmp = make_temporary_clone(exec, result); + tmp->stride_factor_ = stride_factor; + tmp->slice_size_ = slice_size; + tmp->slice_sets_.resize_and_reset(num_slices + 1); + tmp->slice_lengths_.resize_and_reset(num_slices); + exec->run(dense::make_compute_slice_sets(this, slice_size, stride_factor, + tmp->get_slice_sets(), + tmp->get_slice_lengths())); + auto total_cols = + exec->copy_val_to_host(tmp->get_slice_sets() + num_slices); + tmp->col_idxs_.resize_and_reset(total_cols * slice_size); + tmp->values_.resize_and_reset(total_cols * slice_size); + tmp->set_size(this->get_size()); + exec->run(dense::make_convert_to_sellp(this, tmp.get())); +} + + template void Dense::convert_to(Sellp* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_sellp(in, out); - }); + this->convert_impl(result); } @@ -626,9 +595,7 @@ void Dense::move_to(Sellp* result) template void Dense::convert_to(Sellp* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_sellp(in, out); - }); + this->convert_impl(result); } @@ -639,12 +606,31 @@ void Dense::move_to(Sellp* result) } +template +template +void Dense::convert_impl( + SparsityCsr* result) const +{ + auto exec = this->get_executor(); + const auto num_rows = this->get_size()[0]; + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(num_rows + 1); + exec->run( + dense::make_count_nonzeros_per_row(this, tmp->row_ptrs_.get_data())); + exec->run(dense::make_prefix_sum(tmp->row_ptrs_.get_data(), num_rows + 1)); + const auto nnz = + exec->copy_val_to_host(tmp->row_ptrs_.get_const_data() + num_rows); + tmp->col_idxs_.resize_and_reset(nnz); + tmp->value_.fill(one()); + tmp->set_size(this->get_size()); + exec->run(dense::make_convert_to_sparsity_csr(this, tmp.get())); +} + + template void Dense::convert_to(SparsityCsr* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_sparsity_csr(in, out); - }); + this->convert_impl(result); } @@ -658,9 +644,7 @@ void Dense::move_to(SparsityCsr* result) template void Dense::convert_to(SparsityCsr* result) const { - conversion_helper(result, this, [](const auto& in, const auto& out) { - return dense::make_convert_to_sparsity_csr(in, out); - }); + this->convert_impl(result); } diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index cf6200809fe..6b37b44dc71 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -135,6 +135,7 @@ namespace kernels { #define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec) \ void convert_to_coo(std::shared_ptr exec, \ const matrix::Dense<_type>* source, \ + const int64* row_ptrs, \ matrix::Coo<_type, _prec>* other) #define GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(_type, _prec) \ @@ -150,6 +151,7 @@ namespace kernels { #define GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(_type, _prec) \ void convert_to_hybrid(std::shared_ptr exec, \ const matrix::Dense<_type>* source, \ + const int64* coo_row_ptrs, \ matrix::Hybrid<_type, _prec>* other) #define GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(_type, _prec) \ @@ -162,25 +164,24 @@ namespace kernels { const matrix::Dense<_type>* source, \ matrix::SparsityCsr<_type, _prec>* other) -#define GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL(_type) \ - void count_nonzeros(std::shared_ptr exec, \ - const matrix::Dense<_type>* source, size_type* result) +#define GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL(_type) \ + void compute_max_nnz_per_row(std::shared_ptr exec, \ + const matrix::Dense<_type>* source, \ + size_type& result) -#define GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(_type) \ - void calculate_max_nnz_per_row( \ - std::shared_ptr exec, \ - const matrix::Dense<_type>* source, size_type* result) +#define GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL(_type) \ + void compute_slice_sets(std::shared_ptr exec, \ + const matrix::Dense<_type>* source, \ + size_type slice_size, size_type stride_factor, \ + size_type* slice_sets, size_type* slice_lengths) -#define GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL(_type) \ - void calculate_nonzeros_per_row( \ - std::shared_ptr exec, \ - const matrix::Dense<_type>* source, Array* result) +#define GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(_vtype, _itype) \ + void count_nonzeros_per_row(std::shared_ptr exec, \ + const matrix::Dense<_vtype>* source, \ + _itype* result) -#define GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL(_type) \ - void calculate_total_cols(std::shared_ptr exec, \ - const matrix::Dense<_type>* source, \ - size_type* result, size_type stride_factor, \ - size_type slice_size) +#define GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T(_type) \ + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(_type, ::gko::size_type) #define GKO_DECLARE_DENSE_TRANSPOSE_KERNEL(_type) \ void transpose(std::shared_ptr exec, \ @@ -303,13 +304,11 @@ namespace kernels { template \ GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(ValueType, IndexType); \ template \ - GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType); \ + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL(ValueType); \ template \ - GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL(ValueType); \ + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_DENSE_TRANSPOSE_KERNEL(ValueType); \ template \ diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 03cd801818e..2f8c8abb178 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -169,10 +169,15 @@ template void Diagonal::convert_to(Csr* result) const { auto exec = this->get_executor(); - auto tmp = Csr::create( - exec, this->get_size(), this->get_size()[0], result->get_strategy()); - exec->run(diagonal::make_convert_to_csr(this, tmp.get())); - tmp->move_to(result); + { + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(this->get_size()[0] + 1); + tmp->col_idxs_.resize_and_reset(this->get_size()[0]); + tmp->values_.resize_and_reset(this->get_size()[0]); + tmp->set_size(this->get_size()); + exec->run(diagonal::make_convert_to_csr(this, tmp.get())); + } + result->make_srow(); } @@ -187,10 +192,15 @@ template void Diagonal::convert_to(Csr* result) const { auto exec = this->get_executor(); - auto tmp = Csr::create( - exec, this->get_size(), this->get_size()[0], result->get_strategy()); - exec->run(diagonal::make_convert_to_csr(this, tmp.get())); - tmp->move_to(result); + { + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(this->get_size()[0] + 1); + tmp->col_idxs_.resize_and_reset(this->get_size()[0]); + tmp->values_.resize_and_reset(this->get_size()[0]); + tmp->set_size(this->get_size()); + exec->run(diagonal::make_convert_to_csr(this, tmp.get())); + } + result->make_srow(); } diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 4282cbc0721..bc6a2721158 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -151,23 +152,19 @@ void Ell::convert_to( { auto exec = this->get_executor(); const auto num_rows = this->get_size()[0]; - - Array row_ptrs{exec, num_rows + 1}; - - exec->run(ell::make_count_nonzeros_per_row(this, row_ptrs.get_data())); - exec->run(ell::make_prefix_sum(row_ptrs.get_data(), num_rows + 1)); - - const auto nnz = static_cast( - exec->copy_val_to_host(row_ptrs.get_const_data() + num_rows)); - - result->row_ptrs_ = row_ptrs; - result->resize(this->get_size(), nnz); - { auto tmp = make_temporary_clone(exec, result); - tmp->row_ptrs_ = row_ptrs; - exec->run(ell::make_convert_to_csr( - this, make_temporary_clone(exec, result).get())); + tmp->row_ptrs_.resize_and_reset(num_rows + 1); + exec->run( + ell::make_count_nonzeros_per_row(this, tmp->row_ptrs_.get_data())); + exec->run( + ell::make_prefix_sum(tmp->row_ptrs_.get_data(), num_rows + 1)); + const auto nnz = static_cast( + exec->copy_val_to_host(tmp->row_ptrs_.get_const_data() + num_rows)); + tmp->col_idxs_.resize_and_reset(nnz); + tmp->values_.resize_and_reset(nnz); + tmp->set_size(this->get_size()); + exec->run(ell::make_convert_to_csr(this, tmp.get())); } result->make_srow(); } @@ -180,6 +177,20 @@ void Ell::move_to(Csr* result) } +template +void Ell::resize(dim<2> new_size, size_type max_row_nnz) +{ + if (this->get_size() != new_size || + this->get_num_stored_elements_per_row() != max_row_nnz) { + this->stride_ = new_size[0]; + values_.resize_and_reset(this->stride_ * max_row_nnz); + col_idxs_.resize_and_reset(this->stride_ * max_row_nnz); + this->num_stored_elements_per_row_ = max_row_nnz; + this->set_size(new_size); + } +} + + template void Ell::read(const device_mat_data& data) { @@ -190,14 +201,7 @@ void Ell::read(const device_mat_data& data) row_ptrs.get_data())); size_type max_nnz{}; exec->run(ell::make_compute_max_row_nnz(row_ptrs, max_nnz)); - if (this->get_size() != data.size || - this->get_num_stored_elements_per_row() != max_nnz) { - stride_ = data.size[0]; - values_.resize_and_reset(stride_ * max_nnz); - col_idxs_.resize_and_reset(stride_ * max_nnz); - num_stored_elements_per_row_ = max_nnz; - this->set_size(data.size); - } + this->resize(data.size, max_nnz); exec->run(ell::make_fill_in_matrix_data(*local_data, row_ptrs.get_const_data(), this)); } @@ -221,7 +225,7 @@ void Ell::write(mat_data& data) const for (size_type row = 0; row < tmp->get_size()[0]; ++row) { for (size_type i = 0; i < tmp->num_stored_elements_per_row_; ++i) { const auto val = tmp->val_at(row, i); - if (val != zero()) { + if (is_nonzero(val)) { const auto col = tmp->col_at(row, i); data.nonzeros.emplace_back(row, col, val); } diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index de3e0bfe32b..8d54f480036 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -71,10 +71,6 @@ GKO_REGISTER_OPERATION(convert_to_csr, fbcsr::convert_to_csr); GKO_REGISTER_OPERATION(fill_in_dense, fbcsr::fill_in_dense); GKO_REGISTER_OPERATION(transpose, fbcsr::transpose); GKO_REGISTER_OPERATION(conj_transpose, fbcsr::conj_transpose); -GKO_REGISTER_OPERATION(calculate_max_nnz_per_row, - fbcsr::calculate_max_nnz_per_row); -GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, - fbcsr::calculate_nonzeros_per_row); GKO_REGISTER_OPERATION(is_sorted_by_column_index, fbcsr::is_sorted_by_column_index); GKO_REGISTER_OPERATION(sort_by_column_index, fbcsr::sort_by_column_index); @@ -178,9 +174,14 @@ void Fbcsr::convert_to( Csr* const result) const { auto exec = this->get_executor(); - result->resize(this->get_size(), this->get_num_stored_elements()); - exec->run(fbcsr::make_convert_to_csr( - this, make_temporary_clone(exec, result).get())); + { + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(this->get_size()[0] + 1); + tmp->col_idxs_.resize_and_reset(this->get_num_stored_elements()); + tmp->values_.resize_and_reset(this->get_num_stored_elements()); + tmp->set_size(this->get_size()); + exec->run(fbcsr::make_convert_to_csr(this, tmp.get())); + } result->make_srow(); } @@ -197,17 +198,13 @@ template void Fbcsr::convert_to( SparsityCsr* const result) const { - auto exec = this->get_executor(); - auto tmp = SparsityCsr::create( - exec, + result->set_size( gko::dim<2>{static_cast(this->get_num_block_rows()), - static_cast(this->get_num_block_cols())}, - this->get_num_stored_blocks()); - - tmp->col_idxs_ = this->col_idxs_; - tmp->row_ptrs_ = this->row_ptrs_; - tmp->value_ = Array(exec, {one()}); - tmp->move_to(result); + static_cast(this->get_num_block_cols())}); + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->value_ = + Array(result->get_executor(), {one()}); } diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp index fc96fe17d86..152d009ca4c 100644 --- a/core/matrix/fbcsr_kernels.hpp +++ b/core/matrix/fbcsr_kernels.hpp @@ -92,19 +92,6 @@ namespace kernels { const matrix::Fbcsr* orig, \ matrix::Fbcsr* trans) -#define GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, \ - IndexType) \ - void calculate_max_nnz_per_row( \ - std::shared_ptr exec, \ - const matrix::Fbcsr* source, size_type* result) - -#define GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, \ - IndexType) \ - void calculate_nonzeros_per_row( \ - std::shared_ptr exec, \ - const matrix::Fbcsr* source, \ - Array* result) - #define GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType) \ void sort_by_column_index(std::shared_ptr exec, \ matrix::Fbcsr* to_sort) @@ -119,30 +106,26 @@ namespace kernels { const matrix::Fbcsr* orig, \ matrix::Diagonal* diag) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_FBCSR_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType); \ - template \ - GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_FBCSR_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType); \ + template \ + GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType); \ + template \ GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL(ValueType, IndexType) diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index b7d7e0ce539..c4152493d3b 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -161,27 +161,28 @@ void Hybrid::convert_to( { auto exec = this->get_executor(); const auto num_rows = this->get_size()[0]; - - Array ell_row_ptrs{exec, num_rows + 1}; - Array coo_row_ptrs{exec, num_rows + 1}; - - exec->run(hybrid::make_ell_count_nonzeros_per_row(this->get_ell(), - ell_row_ptrs.get_data())); - exec->run(hybrid::make_prefix_sum(ell_row_ptrs.get_data(), num_rows + 1)); - exec->run(hybrid::make_convert_idxs_to_ptrs( - this->get_const_coo_row_idxs(), this->get_coo_num_stored_elements(), - num_rows, coo_row_ptrs.get_data())); - - const auto nnz = static_cast( - exec->copy_val_to_host(ell_row_ptrs.get_const_data() + num_rows) + - exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows)); - - result->resize(this->get_size(), nnz); - - exec->run(hybrid::make_convert_to_csr( - this, ell_row_ptrs.get_const_data(), coo_row_ptrs.get_const_data(), - make_temporary_clone(exec, result).get())); - + { + auto tmp = make_temporary_clone(exec, result); + Array ell_row_ptrs{exec, num_rows + 1}; + Array coo_row_ptrs{exec, num_rows + 1}; + exec->run(hybrid::make_ell_count_nonzeros_per_row( + this->get_ell(), ell_row_ptrs.get_data())); + exec->run( + hybrid::make_prefix_sum(ell_row_ptrs.get_data(), num_rows + 1)); + exec->run(hybrid::make_convert_idxs_to_ptrs( + this->get_const_coo_row_idxs(), this->get_coo_num_stored_elements(), + num_rows, coo_row_ptrs.get_data())); + const auto nnz = static_cast( + exec->copy_val_to_host(ell_row_ptrs.get_const_data() + num_rows) + + exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows)); + tmp->row_ptrs_.resize_and_reset(num_rows + 1); + tmp->col_idxs_.resize_and_reset(nnz); + tmp->values_.resize_and_reset(nnz); + tmp->set_size(this->get_size()); + exec->run(hybrid::make_convert_to_csr( + this, ell_row_ptrs.get_const_data(), coo_row_ptrs.get_const_data(), + tmp.get())); + } result->make_srow(); } @@ -193,6 +194,16 @@ void Hybrid::move_to(Csr* result) } +template +void Hybrid::resize(dim<2> new_size, + size_type ell_row_nnz, + size_type coo_nnz) +{ + ell_->resize(new_size, ell_row_nnz); + coo_->resize(new_size, coo_nnz); +} + + template void Hybrid::read(const device_mat_data& data) { @@ -244,7 +255,7 @@ void Hybrid::write(mat_data& data) const for (size_type i = 0; i < tmp->get_ell_num_stored_elements_per_row(); ++i) { const auto val = tmp->ell_val_at(row, i); - if (val != zero()) { + if (is_nonzero(val)) { const auto col = tmp->ell_col_at(row, i); data.nonzeros.emplace_back(row, col, val); } diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp index 07670678a1d..9e9cc08f88d 100644 --- a/core/matrix/hybrid_kernels.hpp +++ b/core/matrix/hybrid_kernels.hpp @@ -51,6 +51,11 @@ namespace kernels { void compute_row_nnz(std::shared_ptr exec, \ const Array& row_ptrs, size_type* row_nnzs) +#define GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL \ + void compute_coo_row_ptrs(std::shared_ptr exec, \ + const Array& row_nnz, \ + size_type ell_lim, int64* coo_row_ptrs) + #define GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType) \ void split_matrix_data( \ std::shared_ptr exec, \ @@ -66,8 +71,10 @@ namespace kernels { const IndexType* coo_row_ptrs, \ matrix::Csr* result) + #define GKO_DECLARE_ALL_AS_TEMPLATES \ GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ; \ + GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL; \ template \ GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index ad83738c25a..d2db34d3736 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/absolute_array_kernels.hpp" #include "core/components/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/sellp_kernels.hpp" @@ -58,11 +59,12 @@ namespace { GKO_REGISTER_OPERATION(spmv, sellp::spmv); GKO_REGISTER_OPERATION(advanced_spmv, sellp::advanced_spmv); GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); +GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum); GKO_REGISTER_OPERATION(compute_slice_sets, sellp::compute_slice_sets); GKO_REGISTER_OPERATION(fill_in_matrix_data, sellp::fill_in_matrix_data); GKO_REGISTER_OPERATION(fill_in_dense, sellp::fill_in_dense); GKO_REGISTER_OPERATION(convert_to_csr, sellp::convert_to_csr); -GKO_REGISTER_OPERATION(count_nonzeros, sellp::count_nonzeros); +GKO_REGISTER_OPERATION(count_nonzeros_per_row, sellp::count_nonzeros_per_row); GKO_REGISTER_OPERATION(extract_diagonal, sellp::extract_diagonal); GKO_REGISTER_OPERATION(fill_array, components::fill_array); GKO_REGISTER_OPERATION(inplace_absolute_array, @@ -144,14 +146,22 @@ void Sellp::convert_to( Csr* result) const { auto exec = this->get_executor(); - - size_type num_stored_nonzeros = 0; - exec->run(sellp::make_count_nonzeros(this, &num_stored_nonzeros)); - auto tmp = Csr::create( - exec, this->get_size(), num_stored_nonzeros, result->get_strategy()); - exec->run(sellp::make_convert_to_csr(this, tmp.get())); - tmp->make_srow(); - tmp->move_to(result); + const auto num_rows = this->get_size()[0]; + { + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(num_rows + 1); + exec->run(sellp::make_count_nonzeros_per_row( + this, tmp->row_ptrs_.get_data())); + exec->run( + sellp::make_prefix_sum(tmp->row_ptrs_.get_data(), num_rows + 1)); + const auto nnz = static_cast( + exec->copy_val_to_host(tmp->row_ptrs_.get_const_data() + num_rows)); + tmp->col_idxs_.resize_and_reset(nnz); + tmp->values_.resize_and_reset(nnz); + tmp->set_size(this->get_size()); + exec->run(sellp::make_convert_to_csr(this, tmp.get())); + } + result->make_srow(); } @@ -214,7 +224,7 @@ void Sellp::write(mat_data& data) const i++) { const auto val = tmp->val_at( row_in_slice, tmp->get_const_slice_sets()[slice], i); - if (val != zero()) { + if (is_nonzero(val)) { const auto col = tmp->col_at(row_in_slice, tmp->get_const_slice_sets()[slice], i); diff --git a/core/matrix/sellp_kernels.hpp b/core/matrix/sellp_kernels.hpp index 39d8a5007b5..ad2e8ea3ade 100644 --- a/core/matrix/sellp_kernels.hpp +++ b/core/matrix/sellp_kernels.hpp @@ -68,9 +68,9 @@ namespace kernels { const Array>& data, \ const int64* row_ptrs, matrix::Sellp* output) -#define GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS \ +#define GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL(IndexType) \ void compute_slice_sets(std::shared_ptr exec, \ - const Array& row_ptrs, \ + const Array& row_ptrs, \ size_type slice_size, size_type stride_factor, \ size_type* slice_sets, size_type* slice_lengths) @@ -84,31 +84,32 @@ namespace kernels { const matrix::Sellp* source, \ matrix::Csr* result) -#define GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL(ValueType, IndexType) \ - void count_nonzeros(std::shared_ptr exec, \ - const matrix::Sellp* source, \ - size_type* result) +#define GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType) \ + void count_nonzeros_per_row( \ + std::shared_ptr exec, \ + const matrix::Sellp* source, IndexType* result) #define GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) \ void extract_diagonal(std::shared_ptr exec, \ const matrix::Sellp* orig, \ matrix::Diagonal* diag) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_SELLP_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ - GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS; \ - template \ - GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL(ValueType, IndexType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_SELLP_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SELLP_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL(IndexType); \ + template \ + GKO_DECLARE_SELLP_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL(ValueType, IndexType) diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 144f1b4ec6a..6cb9b598403 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -58,6 +59,7 @@ GKO_REGISTER_OPERATION(count_num_diagonal_elements, sparsity_csr::count_num_diagonal_elements); GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(fill_in_matrix_data, sparsity_csr::fill_in_matrix_data); +GKO_REGISTER_OPERATION(fill_in_dense, sparsity_csr::fill_in_dense); GKO_REGISTER_OPERATION(remove_diagonal_elements, sparsity_csr::remove_diagonal_elements); GKO_REGISTER_OPERATION(sort_by_column_index, @@ -98,6 +100,47 @@ void SparsityCsr::apply_impl(const LinOp* alpha, } +template +void SparsityCsr::convert_to( + Csr* result) const +{ + result->row_ptrs_ = this->row_ptrs_; + result->col_idxs_ = this->col_idxs_; + result->values_.resize_and_reset(this->get_num_nonzeros()); + result->values_.fill( + this->get_executor()->copy_val_to_host(this->get_const_value())); + result->set_size(this->get_size()); + result->make_srow(); +} + + +template +void SparsityCsr::move_to( + Csr* result) +{ + this->convert_to(result); +} + + +template +void SparsityCsr::convert_to( + Dense* result) const +{ + auto exec = this->get_executor(); + auto tmp = make_temporary_clone(exec, result); + tmp->resize(this->get_size()); + tmp->fill(zero()); + exec->run(sparsity_csr::make_fill_in_dense(this, tmp.get())); +} + + +template +void SparsityCsr::move_to(Dense* result) +{ + this->convert_to(result); +} + + template void SparsityCsr::read(const device_mat_data& data) { diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index 828eda4b526..fafe0dd7214 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -61,6 +61,11 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Dense* c) +#define GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ + void fill_in_dense(std::shared_ptr exec, \ + const matrix::SparsityCsr* input, \ + matrix::Dense* output) + #define GKO_DECLARE_SPARSITY_CSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, \ IndexType) \ void fill_in_matrix_data( \ @@ -105,6 +110,8 @@ namespace kernels { template \ GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_SPARSITY_CSR_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL(ValueType, \ diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index ab72c44a018..5987898ee9f 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -58,6 +58,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/reduction.cuh" @@ -869,151 +870,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_sellp(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Sellp* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - auto slice_lengths = result->get_slice_lengths(); - auto slice_sets = result->get_slice_sets(); - - const auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); - const int slice_num = ceildiv(num_rows, slice_size); - - const auto source_values = source->get_const_values(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto nnz_per_row = Array(exec, num_rows); - auto grid_dim = ceildiv(num_rows, default_block_size); - - if (grid_dim > 0) { - kernel::calculate_nnz_per_row<<>>( - num_rows, as_cuda_type(source_row_ptrs), - as_cuda_type(nnz_per_row.get_data())); - } - - grid_dim = slice_num; - - if (grid_dim > 0) { - kernel::calculate_slice_lengths<<>>( - num_rows, slice_size, stride_factor, - as_cuda_type(nnz_per_row.get_const_data()), - as_cuda_type(slice_lengths), as_cuda_type(slice_sets)); - } - - components::prefix_sum(exec, slice_sets, slice_num + 1); - - grid_dim = ceildiv(num_rows, default_block_size); - if (grid_dim > 0) { - kernel::fill_in_sellp<<>>( - num_rows, slice_size, as_cuda_type(source_values), - as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs), - as_cuda_type(slice_lengths), as_cuda_type(slice_sets), - as_cuda_type(result_col_idxs), as_cuda_type(result_values)); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); - - -template -void convert_to_ell(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Ell* result) -{ - const auto source_values = source->get_const_values(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - const auto stride = result->get_stride(); - const auto max_nnz_per_row = result->get_num_stored_elements_per_row(); - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - - const auto init_grid_dim = - ceildiv(max_nnz_per_row * num_rows, default_block_size); - - kernel::initialize_zero_ell<<>>( - max_nnz_per_row, stride, as_cuda_type(result_values), - as_cuda_type(result_col_idxs)); - - const auto grid_dim = - ceildiv(num_rows * config::warp_size, default_block_size); - - kernel::fill_in_ell<<>>( - num_rows, stride, as_cuda_type(source_values), - as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs), - as_cuda_type(result_values), as_cuda_type(result_col_idxs)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows == 0) { - *result = 0; - return; - } - - const auto slice_num = ceildiv(num_rows, slice_size); - const auto row_ptrs = source->get_const_row_ptrs(); - - auto nnz_per_row = Array(exec, num_rows); - auto grid_dim = ceildiv(num_rows, default_block_size); - - kernel::calculate_nnz_per_row<<>>( - num_rows, as_cuda_type(row_ptrs), as_cuda_type(nnz_per_row.get_data())); - - grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); - auto max_nnz_per_slice = Array(exec, slice_num); - - kernel::reduce_max_nnz_per_slice<<>>( - num_rows, slice_size, stride_factor, - as_cuda_type(nnz_per_row.get_const_data()), - as_cuda_type(max_nnz_per_slice.get_data())); - - grid_dim = ceildiv(slice_num, default_block_size); - auto block_results = Array(exec, grid_dim); - - kernel::reduce_total_cols<<>>( - slice_num, as_cuda_type(max_nnz_per_slice.get_const_data()), - as_cuda_type(block_results.get_data())); - - auto d_result = Array(exec, 1); - - kernel::reduce_total_cols<<<1, default_block_size>>>( - grid_dim, as_cuda_type(block_results.get_const_data()), - as_cuda_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -1196,39 +1052,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - - auto nnz_per_row = Array(exec, num_rows); - auto block_results = Array(exec, default_block_size); - auto d_result = Array(exec, 1); - - const auto grid_dim = ceildiv(num_rows, default_block_size); - kernel::calculate_nnz_per_row<<>>( - num_rows, as_cuda_type(source->get_const_row_ptrs()), - as_cuda_type(nnz_per_row.get_data())); - - const auto n = ceildiv(num_rows, default_block_size); - const auto reduce_dim = n <= default_block_size ? n : default_block_size; - kernel::reduce_max_nnz<<>>( - num_rows, as_cuda_type(nnz_per_row.get_const_data()), - as_cuda_type(block_results.get_data())); - - kernel::reduce_max_nnz<<<1, default_block_size>>>( - reduce_dim, as_cuda_type(block_results.get_const_data()), - as_cuda_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - template void calculate_nonzeros_per_row_in_span( std::shared_ptr exec, @@ -1278,65 +1101,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); -template -void convert_to_hybrid(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Hybrid* result) -{ - auto ell_val = result->get_ell_values(); - auto ell_col = result->get_ell_col_idxs(); - auto coo_val = result->get_coo_values(); - auto coo_col = result->get_coo_col_idxs(); - auto coo_row = result->get_coo_row_idxs(); - const auto stride = result->get_ell_stride(); - const auto max_nnz_per_row = result->get_ell_num_stored_elements_per_row(); - const auto num_rows = result->get_size()[0]; - const auto coo_num_stored_elements = result->get_coo_num_stored_elements(); - auto grid_dim = ceildiv(max_nnz_per_row * num_rows, default_block_size); - - kernel::initialize_zero_ell<<>>( - max_nnz_per_row, stride, as_cuda_type(ell_val), as_cuda_type(ell_col)); - - grid_dim = ceildiv(num_rows, default_block_size); - auto coo_offset = Array(exec, num_rows); - kernel::calculate_hybrid_coo_row_nnz<<>>( - num_rows, max_nnz_per_row, as_cuda_type(source->get_const_row_ptrs()), - as_cuda_type(coo_offset.get_data())); - - components::prefix_sum(exec, coo_offset.get_data(), num_rows); - - grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - kernel::fill_in_hybrid<<>>( - num_rows, stride, max_nnz_per_row, - as_cuda_type(source->get_const_values()), - as_cuda_type(source->get_const_row_ptrs()), - as_cuda_type(source->get_const_col_idxs()), - as_cuda_type(coo_offset.get_const_data()), as_cuda_type(ell_val), - as_cuda_type(ell_col), as_cuda_type(coo_val), as_cuda_type(coo_col), - as_cuda_type(coo_row)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Csr* source, - Array* result) -{ - const auto num_rows = source->get_size()[0]; - auto row_ptrs = source->get_const_row_ptrs(); - auto grid_dim = ceildiv(num_rows, default_block_size); - - kernel::calculate_nnz_per_row<<>>( - num_rows, as_cuda_type(row_ptrs), as_cuda_type(result->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 083b3de407f..9ee63d0954f 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -48,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -130,6 +132,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, + const int64* row_ptrs, matrix::Coo* result) { auto num_rows = result->get_size()[0]; @@ -141,18 +144,14 @@ void convert_to_coo(std::shared_ptr exec, auto stride = source->get_stride(); - auto nnz_prefix_sum = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); - - components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); - - size_type grid_dim = ceildiv(num_rows, default_block_size); - - kernel::fill_in_coo<<>>( - num_rows, num_cols, stride, - as_cuda_type(nnz_prefix_sum.get_const_data()), - as_cuda_type(source->get_const_values()), as_cuda_type(row_idxs), - as_cuda_type(col_idxs), as_cuda_type(values)); + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + kernel::fill_in_coo<<>>( + num_rows, num_cols, stride, + as_cuda_type(source->get_const_values()), row_ptrs, row_idxs, + col_idxs, as_cuda_type(values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -173,20 +172,14 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, config::warp_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - - kernel::count_nnz_per_row<<>>( - num_rows, num_cols, stride, as_cuda_type(source->get_const_values()), - as_cuda_type(row_ptrs)); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - size_type grid_dim = ceildiv(num_rows, default_block_size); - - kernel::fill_in_csr<<>>( - num_rows, num_cols, stride, as_cuda_type(source->get_const_values()), - as_cuda_type(row_ptrs), as_cuda_type(col_idxs), as_cuda_type(values)); + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + kernel::fill_in_csr<<>>( + num_rows, num_cols, stride, + as_cuda_type(source->get_const_values()), row_ptrs, col_idxs, + as_cuda_type(values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -202,17 +195,19 @@ void convert_to_ell(std::shared_ptr exec, auto num_cols = result->get_size()[1]; auto max_nnz_per_row = result->get_num_stored_elements_per_row(); - auto col_ptrs = result->get_col_idxs(); + auto col_idxs = result->get_col_idxs(); auto values = result->get_values(); auto source_stride = source->get_stride(); auto result_stride = result->get_stride(); - auto grid_dim = ceildiv(result_stride, default_block_size); - kernel::fill_in_ell<<>>( - num_rows, num_cols, source_stride, - as_cuda_type(source->get_const_values()), max_nnz_per_row, - result_stride, as_cuda_type(col_ptrs), as_cuda_type(values)); + auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + kernel::fill_in_ell<<>>( + num_rows, num_cols, source_stride, + as_cuda_type(source->get_const_values()), max_nnz_per_row, + result_stride, col_idxs, as_cuda_type(values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -222,8 +217,30 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, + const int64* coo_row_ptrs, matrix::Hybrid* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto ell_max_nnz_per_row = + result->get_ell_num_stored_elements_per_row(); + const auto source_stride = source->get_stride(); + const auto ell_stride = result->get_ell_stride(); + auto ell_col_idxs = result->get_ell_col_idxs(); + auto ell_values = result->get_ell_values(); + auto coo_row_idxs = result->get_coo_row_idxs(); + auto coo_col_idxs = result->get_coo_col_idxs(); + auto coo_values = result->get_coo_values(); + + auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + kernel::fill_in_hybrid<<>>( + num_rows, num_cols, source_stride, + as_cuda_type(source->get_const_values()), ell_max_nnz_per_row, + ell_stride, ell_col_idxs, as_cuda_type(ell_values), coo_row_ptrs, + coo_row_idxs, coo_col_idxs, as_cuda_type(coo_values)); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -242,30 +259,9 @@ void convert_to_sellp(std::shared_ptr exec, auto col_idxs = result->get_col_idxs(); auto slice_lengths = result->get_slice_lengths(); auto slice_sets = result->get_slice_sets(); + const auto slice_size = result->get_slice_size(); - const auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); - const int slice_num = ceildiv(num_rows, slice_size); - - auto nnz_per_row = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - auto grid_dim = slice_num; - - if (grid_dim > 0) { - kernel::calculate_slice_lengths<<>>( - num_rows, slice_size, slice_num, stride_factor, - as_cuda_type(nnz_per_row.get_const_data()), - as_cuda_type(slice_lengths), as_cuda_type(slice_sets)); - } - - components::prefix_sum(exec, slice_sets, slice_num + 1); - - grid_dim = ceildiv(num_rows, default_block_size); + auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); if (grid_dim > 0) { kernel::fill_in_sellp<<>>( num_rows, num_cols, slice_size, stride, @@ -289,128 +285,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Dense* source, size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - const auto n = ceildiv(num_rows, default_block_size); - const size_type grid_dim = - (n <= default_block_size) ? n : default_block_size; - - auto block_results = Array(exec, grid_dim); - - kernel::reduce_max_nnz<<>>( - num_rows, as_cuda_type(nnz_per_row.get_const_data()), - as_cuda_type(block_results.get_data())); - - auto d_result = Array(exec, 1); - - kernel::reduce_max_nnz<<<1, default_block_size, - default_block_size * sizeof(size_type)>>>( - grid_dim, as_cuda_type(block_results.get_const_data()), - as_cuda_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* source, - Array* result) -{ - const dim3 block_size(default_block_size, 1, 1); - auto rows_per_block = ceildiv(default_block_size, config::warp_size); - const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); - const dim3 grid_size(grid_x, 1, 1); - if (grid_x > 0) { - kernel::count_nnz_per_row<<>>( - source->get_size()[0], source->get_size()[1], source->get_stride(), - as_cuda_type(source->get_const_values()), - as_cuda_type(result->get_data())); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows == 0) { - *result = 0; - return; - } - - const auto num_cols = source->get_size()[1]; - const auto slice_num = ceildiv(num_rows, slice_size); - - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - auto max_nnz_per_slice = Array(exec, slice_num); - - auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); - - kernel::reduce_max_nnz_per_slice<<>>( - num_rows, slice_size, stride_factor, - as_cuda_type(nnz_per_row.get_const_data()), - as_cuda_type(max_nnz_per_slice.get_data())); - - grid_dim = ceildiv(slice_num, default_block_size); - auto block_results = Array(exec, grid_dim); - - kernel::reduce_total_cols<<>>( - slice_num, as_cuda_type(max_nnz_per_slice.get_const_data()), - as_cuda_type(block_results.get_data())); - - auto d_result = Array(exec, 1); - - kernel::reduce_total_cols<<<1, default_block_size, - default_block_size * sizeof(size_type)>>>( - grid_dim, as_cuda_type(block_results.get_const_data()), - as_cuda_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Dense* orig, diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 4c7db28eb37..8537e44c831 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -287,30 +287,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Ell* orig, - matrix::Diagonal* diag) -{ - const auto max_nnz_per_row = orig->get_num_stored_elements_per_row(); - const auto orig_stride = orig->get_stride(); - const auto diag_size = diag->get_size()[0]; - const auto num_blocks = - ceildiv(diag_size * max_nnz_per_row, default_block_size); - - const auto orig_values = orig->get_const_values(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - - kernel::extract_diagonal<<>>( - diag_size, max_nnz_per_row, orig_stride, as_cuda_type(orig_values), - as_cuda_type(orig_col_idxs), as_cuda_type(diag_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); - - } // namespace ell } // namespace cuda } // namespace kernels diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index eebec3cc574..6926e5d828f 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -240,7 +240,16 @@ template void convert_to_csr(const std::shared_ptr exec, const matrix::Fbcsr* const source, matrix::Csr* const result) - GKO_NOT_IMPLEMENTED; +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + kernel::convert_to_csr<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_cuda_type(source->get_const_values()), result->get_row_ptrs(), + result->get_col_idxs(), as_cuda_type(result->get_values()), + source->get_num_block_rows(), source->get_block_size()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); @@ -324,49 +333,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -template -void calculate_max_nnz_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* const source, - size_type* const result) -{ - const auto num_b_rows = source->get_num_block_rows(); - const auto bs = source->get_block_size(); - - auto nnz_per_row = Array(exec, num_b_rows); - auto block_results = Array(exec, default_block_size); - auto d_result = Array(exec, 1); - - const auto grid_dim = ceildiv(num_b_rows, default_block_size); - csr_reuse::kernel::calculate_nnz_per_row<<>>( - num_b_rows, as_cuda_type(source->get_const_row_ptrs()), - nnz_per_row.get_data()); - - const auto n = ceildiv(num_b_rows, default_block_size); - const auto reduce_dim = n <= default_block_size ? n : default_block_size; - csr_reuse::kernel::reduce_max_nnz<<>>( - num_b_rows, nnz_per_row.get_const_data(), block_results.get_data()); - - csr_reuse::kernel::reduce_max_nnz<<<1, default_block_size>>>( - reduce_dim, block_results.get_const_data(), d_result.get_data()); - - *result = bs * exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* source, - Array* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void is_sorted_by_column_index( std::shared_ptr exec, diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu index 9db0cddce2a..f1b05c4f605 100644 --- a/cuda/matrix/hybrid_kernels.cu +++ b/cuda/matrix/hybrid_kernels.cu @@ -40,24 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include - - #include "common/unified/base/kernel_launch.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_kernels.hpp" -#include "core/matrix/ell_kernels.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/format_conversion.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/segment_scan.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { @@ -71,10 +55,6 @@ namespace cuda { namespace hybrid { -constexpr int default_block_size = 512; -constexpr int warps_in_block = 4; - - #include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index a37f673c549..15683d4d3e5 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -109,109 +109,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Csr* result) -{ - const auto num_rows = source->get_size()[0]; - const auto slice_size = source->get_slice_size(); - const auto slice_num = ceildiv(num_rows, slice_size); - - const auto source_values = source->get_const_values(); - const auto source_slice_lengths = source->get_const_slice_lengths(); - const auto source_slice_sets = source->get_const_slice_sets(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - auto result_row_ptrs = result->get_row_ptrs(); - - auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - - if (grid_dim > 0) { - kernel::count_nnz_per_row<<>>( - num_rows, slice_size, as_cuda_type(source_slice_sets), - as_cuda_type(source_values), as_cuda_type(result_row_ptrs)); - } - - grid_dim = ceildiv(num_rows + 1, default_block_size); - auto add_values = Array(exec, grid_dim); - - components::prefix_sum(exec, result_row_ptrs, num_rows + 1); - - grid_dim = ceildiv(num_rows, default_block_size); - - if (grid_dim > 0) { - kernel::fill_in_csr<<>>( - num_rows, slice_size, as_cuda_type(source_slice_sets), - as_cuda_type(source_col_idxs), as_cuda_type(source_values), - as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs), - as_cuda_type(result_values)); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Sellp* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows <= 0) { - *result = 0; - return; - } - - const auto slice_size = source->get_slice_size(); - const auto slice_sets = source->get_const_slice_sets(); - const auto values = source->get_const_values(); - - auto nnz_per_row = Array(exec, num_rows); - - auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - - kernel::count_nnz_per_row<<>>( - num_rows, slice_size, as_cuda_type(slice_sets), as_cuda_type(values), - as_cuda_type(nnz_per_row.get_data())); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Sellp* orig, - matrix::Diagonal* diag) -{ - const auto diag_size = diag->get_size()[0]; - const auto slice_size = orig->get_slice_size(); - const auto slice_num = ceildiv(diag_size, slice_size); - const auto num_blocks = - ceildiv(slice_num * config::warp_size, default_block_size); - - const auto orig_slice_sets = orig->get_const_slice_sets(); - const auto orig_values = orig->get_const_values(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - - kernel::extract_diagonal<<>>( - diag_size, slice_size, as_cuda_type(orig_slice_sets), - as_cuda_type(orig_values), as_cuda_type(orig_col_idxs), - as_cuda_type(diag_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); - - } // namespace sellp } // namespace cuda } // namespace kernels diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 8a0dc2edbdb..30a778572ad 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -69,6 +69,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::SparsityCsr* input, + matrix::Dense* output) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); + + template void count_num_diagonal_elements( std::shared_ptr exec, diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu index 8ff29dd496e..fb6f5c4b50d 100644 --- a/cuda/multigrid/amgx_pgm_kernels.cu +++ b/cuda/multigrid/amgx_pgm_kernels.cu @@ -49,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" diff --git a/cuda/test/matrix/coo_kernels.cpp b/cuda/test/matrix/coo_kernels.cpp index 68517be18f8..64b9acba61b 100644 --- a/cuda/test/matrix/coo_kernels.cpp +++ b/cuda/test/matrix/coo_kernels.cpp @@ -336,7 +336,7 @@ TEST_F(Coo, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -351,7 +351,7 @@ TEST_F(Coo, ConvertToCsrIsEquivalentToRef) dense_mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp index 48fc4dc6f5c..0b1936f99ff 100644 --- a/cuda/test/matrix/csr_kernels.cpp +++ b/cuda/test/matrix/csr_kernels.cpp @@ -520,7 +520,7 @@ TEST_F(Csr, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -533,7 +533,7 @@ TEST_F(Csr, MoveToDenseIsEquivalentToRef) mtx->move_to(dense_mtx.get()); dmtx->move_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -546,7 +546,7 @@ TEST_F(Csr, ConvertToEllIsEquivalentToRef) mtx->convert_to(ell_mtx.get()); dmtx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 0); } @@ -559,7 +559,7 @@ TEST_F(Csr, MoveToEllIsEquivalentToRef) mtx->move_to(ell_mtx.get()); dmtx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 0); } @@ -572,7 +572,7 @@ TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef) mtx->convert_to(sparsity_mtx.get()); dmtx->convert_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 0); } @@ -585,22 +585,7 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) mtx->move_to(sparsity_mtx.get()); dmtx->move_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14); -} - - -TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef) -{ - set_up_apply_data(std::make_shared()); - gko::size_type max_nnz_per_row; - gko::size_type dmax_nnz_per_row; - - gko::kernels::reference::csr::calculate_max_nnz_per_row(ref, mtx.get(), - &max_nnz_per_row); - gko::kernels::cuda::csr::calculate_max_nnz_per_row(cuda, dmtx.get(), - &dmax_nnz_per_row); - - ASSERT_EQ(max_nnz_per_row, dmax_nnz_per_row); + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 0); } @@ -613,7 +598,7 @@ TEST_F(Csr, ConvertToCooIsEquivalentToRef) mtx->convert_to(coo_mtx.get()); dmtx->convert_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -626,7 +611,7 @@ TEST_F(Csr, MoveToCooIsEquivalentToRef) mtx->move_to(coo_mtx.get()); dmtx->move_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -639,7 +624,7 @@ TEST_F(Csr, ConvertToSellpIsEquivalentToRef) mtx->convert_to(sellp_mtx.get()); dmtx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 0); } @@ -652,7 +637,7 @@ TEST_F(Csr, MoveToSellpIsEquivalentToRef) mtx->move_to(sellp_mtx.get()); dmtx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 0); } @@ -668,31 +653,16 @@ TEST_F(Csr, ConvertsEmptyToSellp) } -TEST_F(Csr, CalculateTotalColsIsEquivalentToRef) -{ - set_up_apply_data(std::make_shared()); - gko::size_type total_cols; - gko::size_type dtotal_cols; - - gko::kernels::reference::csr::calculate_total_cols( - ref, mtx.get(), &total_cols, 2, gko::matrix::default_slice_size); - gko::kernels::cuda::csr::calculate_total_cols( - cuda, dmtx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); - - ASSERT_EQ(total_cols, dtotal_cols); -} - - TEST_F(Csr, CalculatesNonzerosPerRow) { set_up_apply_data(std::make_shared()); gko::Array row_nnz(ref, mtx->get_size()[0]); gko::Array drow_nnz(cuda, dmtx->get_size()[0]); - gko::kernels::reference::csr::calculate_nonzeros_per_row(ref, mtx.get(), - &row_nnz); - gko::kernels::cuda::csr::calculate_nonzeros_per_row(cuda, dmtx.get(), - &drow_nnz); + gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), + row_nnz.get_data()); + gko::kernels::cuda::csr::count_nonzeros_per_row(cuda, dmtx.get(), + drow_nnz.get_data()); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } @@ -710,7 +680,7 @@ TEST_F(Csr, ConvertToHybridIsEquivalentToRef) mtx->convert_to(hybrid_mtx.get()); dmtx->convert_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } @@ -726,7 +696,7 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef) mtx->move_to(hybrid_mtx.get()); dmtx->move_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 7e7a72f4d71..8b3ab2630f3 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -392,7 +392,7 @@ TEST_F(Dense, ConvertToCooIsEquivalentToRef) ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 0); } @@ -407,7 +407,7 @@ TEST_F(Dense, MoveToCooIsEquivalentToRef) ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 0); } @@ -420,7 +420,7 @@ TEST_F(Dense, ConvertToCsrIsEquivalentToRef) x->convert_to(csr_mtx.get()); dx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 0); } @@ -433,7 +433,7 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) x->move_to(csr_mtx.get()); dx->move_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 0); } @@ -446,7 +446,7 @@ TEST_F(Dense, ConvertToEllIsEquivalentToRef) x->convert_to(ell_mtx.get()); dx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 0); } @@ -459,7 +459,7 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef) x->move_to(ell_mtx.get()); dx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 0); } @@ -472,7 +472,7 @@ TEST_F(Dense, ConvertToSellpIsEquivalentToRef) x->convert_to(sellp_mtx.get()); dx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 0); } @@ -485,7 +485,7 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef) x->move_to(sellp_mtx.get()); dx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 0); } @@ -501,19 +501,6 @@ TEST_F(Dense, ConvertsEmptyToSellp) } -TEST_F(Dense, CountNNZIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); - gko::kernels::cuda::dense::count_nonzeros(cuda, dx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); -} - - TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); @@ -522,10 +509,10 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) gko::Array dnnz_per_row(cuda); dnnz_per_row.resize_and_reset(dx->get_size()[0]); - gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), - &nnz_per_row); - gko::kernels::cuda::dense::calculate_nonzeros_per_row(cuda, dx.get(), - &dnnz_per_row); + gko::kernels::reference::dense::count_nonzeros_per_row( + ref, x.get(), nnz_per_row.get_data()); + gko::kernels::cuda::dense::count_nonzeros_per_row(cuda, dx.get(), + dnnz_per_row.get_data()); auto tmp = gko::Array(ref, dnnz_per_row); for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { @@ -534,36 +521,21 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) } -TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef) { set_up_apply_data(); gko::size_type max_nnz; gko::size_type dmax_nnz; - gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), - &max_nnz); - gko::kernels::cuda::dense::calculate_max_nnz_per_row(cuda, dx.get(), - &dmax_nnz); + gko::kernels::reference::dense::compute_max_nnz_per_row(ref, x.get(), + max_nnz); + gko::kernels::cuda::dense::compute_max_nnz_per_row(cuda, dx.get(), + dmax_nnz); ASSERT_EQ(max_nnz, dmax_nnz); } -TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type total_cols; - gko::size_type dtotal_cols; - - gko::kernels::reference::dense::calculate_total_cols( - ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); - gko::kernels::cuda::dense::calculate_total_cols( - cuda, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); - - ASSERT_EQ(total_cols, dtotal_cols); -} - - TEST_F(Dense, IsTransposable) { set_up_apply_data(); diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 0eb9feaa8ea..5b35818c0b6 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -536,7 +536,7 @@ TEST_F(Ell, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -550,7 +550,7 @@ TEST_F(Ell, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/cuda/test/matrix/fbcsr_kernels.cpp b/cuda/test/matrix/fbcsr_kernels.cpp index 796c17d49ed..0b59a248dd4 100644 --- a/cuda/test/matrix/fbcsr_kernels.cpp +++ b/cuda/test/matrix/fbcsr_kernels.cpp @@ -318,24 +318,6 @@ TYPED_TEST(Fbcsr, ConjTransposeIsEquivalentToRefSortedBS3) } -TYPED_TEST(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3) -{ - using Mtx = typename TestFixture::Mtx; - using value_type = typename Mtx::value_type; - using index_type = typename Mtx::index_type; - auto rand_cuda = Mtx::create(this->cuda); - rand_cuda->copy_from(gko::lend(this->rsorted_ref)); - gko::size_type ref_max_nnz{}, cuda_max_nnz{}; - - gko::kernels::cuda::fbcsr::calculate_max_nnz_per_row( - this->cuda, rand_cuda.get(), &cuda_max_nnz); - gko::kernels::reference::fbcsr::calculate_max_nnz_per_row( - this->ref, this->rsorted_ref.get(), &ref_max_nnz); - - ASSERT_EQ(ref_max_nnz, cuda_max_nnz); -} - - TYPED_TEST(Fbcsr, RecognizeSortedMatrix) { using Mtx = typename TestFixture::Mtx; diff --git a/cuda/test/matrix/hybrid_kernels.cpp b/cuda/test/matrix/hybrid_kernels.cpp index 74b2e9b2eb8..9dc76c4864d 100644 --- a/cuda/test/matrix/hybrid_kernels.cpp +++ b/cuda/test/matrix/hybrid_kernels.cpp @@ -219,7 +219,7 @@ TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/cuda/test/matrix/sellp_kernels.cpp b/cuda/test/matrix/sellp_kernels.cpp index be3451ea712..ad530b9ff1b 100644 --- a/cuda/test/matrix/sellp_kernels.cpp +++ b/cuda/test/matrix/sellp_kernels.cpp @@ -252,7 +252,7 @@ TEST_F(Sellp, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -265,7 +265,7 @@ TEST_F(Sellp, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } @@ -295,19 +295,6 @@ TEST_F(Sellp, ConvertEmptyToCsrIsEquivalentToRef) } -TEST_F(Sellp, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_matrix(64); - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::sellp::count_nonzeros(ref, mtx.get(), &nnz); - gko::kernels::cuda::sellp::count_nonzeros(cuda, dmtx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); -} - - TEST_F(Sellp, ExtractDiagonalIsEquivalentToRef) { set_up_apply_matrix(64); diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 282f55d1e70..5de9c242217 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -727,42 +727,6 @@ void abstract_classical_spmv(dim3 grid, dim3 block, } -template -void convert_row_ptrs_to_idxs(size_type num_rows, - const IndexType* __restrict__ ptrs, - IndexType* __restrict__ idxs, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num_rows) { - for (auto i = ptrs[tidx]; i < ptrs[tidx + 1]; i++) { - idxs[i] = tidx; - } - } -} - -GKO_ENABLE_DEFAULT_HOST(convert_row_ptrs_to_idxs, convert_row_ptrs_to_idxs); - - -template -void initialize_zero_dense(size_type num_rows, size_type num_cols, - size_type stride, ValueType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - const auto tidx_x = - item_ct1.get_local_id(2) + - item_ct1.get_local_range().get(2) * item_ct1.get_group(2); - const auto tidx_y = - item_ct1.get_local_id(1) + - item_ct1.get_local_range().get(1) * item_ct1.get_group(1); - if (tidx_x < num_cols && tidx_y < num_rows) { - result[tidx_y * stride + tidx_x] = zero(); - } -} - -GKO_ENABLE_DEFAULT_HOST(initialize_zero_dense, initialize_zero_dense); - - template void fill_in_dense(size_type num_rows, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idxs, @@ -795,189 +759,6 @@ void calculate_nnz_per_row(size_type num_rows, GKO_ENABLE_DEFAULT_HOST(calculate_nnz_per_row, calculate_nnz_per_row); -void calculate_slice_lengths(size_type num_rows, size_type slice_size, - size_type stride_factor, - const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ slice_lengths, - size_type* __restrict__ slice_sets, - sycl::nd_item<3> item_ct1) -{ - constexpr auto warp_size = config::warp_size; - const auto sliceid = item_ct1.get_group(2); - const auto tid_in_warp = item_ct1.get_local_id(2); - - if (sliceid * slice_size + tid_in_warp < num_rows) { - size_type thread_result = 0; - for (int i = tid_in_warp; i < slice_size; i += warp_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } - - auto warp_tile = group::tiled_partition( - group::this_thread_block(item_ct1)); - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0) { - auto slice_length = - ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } - } -} - -GKO_ENABLE_DEFAULT_HOST(calculate_slice_lengths, calculate_slice_lengths); - - -template -void fill_in_sellp(size_type num_rows, size_type slice_size, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - size_type* __restrict__ slice_lengths, - size_type* __restrict__ slice_sets, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values, - sycl::nd_item<3> item_ct1) -{ - const auto global_row = thread::get_thread_id_flat(item_ct1); - const auto row = global_row % slice_size; - const auto sliceid = global_row / slice_size; - - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[sliceid] * slice_size + row; - - for (size_type csr_ind = source_row_ptrs[global_row]; - csr_ind < source_row_ptrs[global_row + 1]; csr_ind++) { - result_values[sellp_ind] = source_values[csr_ind]; - result_col_idxs[sellp_ind] = source_col_idxs[csr_ind]; - sellp_ind += slice_size; - } - for (size_type i = sellp_ind; - i < - (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; - i += slice_size) { - result_col_idxs[i] = 0; - result_values[i] = zero(); - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp); - - -template -void initialize_zero_ell(size_type max_nnz_per_row, size_type stride, - ValueType* __restrict__ values, - IndexType* __restrict__ col_idxs, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx < stride * max_nnz_per_row) { - values[tidx] = zero(); - col_idxs[tidx] = 0; - } -} - -GKO_ENABLE_DEFAULT_HOST(initialize_zero_ell, initialize_zero_ell); - - -template -void fill_in_ell(size_type num_rows, size_type stride, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - ValueType* __restrict__ result_values, - IndexType* __restrict__ result_col_idxs, - sycl::nd_item<3> item_ct1) -{ - constexpr auto warp_size = config::warp_size; - const auto row = thread::get_subwarp_id_flat(item_ct1); - const auto local_tidx = item_ct1.get_local_id(2) % warp_size; - - if (row < num_rows) { - for (size_type i = local_tidx; - i < source_row_ptrs[row + 1] - source_row_ptrs[row]; - i += warp_size) { - const auto result_idx = row + stride * i; - const auto source_idx = i + source_row_ptrs[row]; - result_values[result_idx] = source_values[source_idx]; - result_col_idxs[result_idx] = source_col_idxs[source_idx]; - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell); - - -void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, - size_type stride_factor, - const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - constexpr auto warp_size = config::warp_size; - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - const auto warpid = thread::get_subwarp_id_flat(item_ct1); - const auto tid_in_warp = warp_tile.thread_rank(); - const auto slice_num = ceildiv(num_rows, slice_size); - - size_type thread_result = 0; - for (auto i = tid_in_warp; i < slice_size; i += warp_size) { - if (warpid * slice_size + i < num_rows) { - thread_result = - max(thread_result, nnz_per_row[warpid * slice_size + i]); - } - } - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0 && warpid < slice_num) { - result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; - } -} - -GKO_ENABLE_DEFAULT_HOST(reduce_max_nnz_per_slice, reduce_max_nnz_per_slice); - - -void reduce_total_cols(size_type num_slices, - const size_type* __restrict__ max_nnz_per_slice, - size_type* __restrict__ result, - sycl::nd_item<3> item_ct1, size_type* block_result) -{ - reduce_array(num_slices, max_nnz_per_slice, block_result, item_ct1, - [](const size_type& x, const size_type& y) { return x + y; }); - - if (item_ct1.get_local_id(2) == 0) { - result[item_ct1.get_group(2)] = block_result[0]; - } -} - -void reduce_total_cols(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_slices, - const size_type* max_nnz_per_slice, size_type* result) -{ - queue->submit([&](sycl::handler& cgh) { - sycl::accessor - block_result_acc_ct1(sycl::range<1>(default_block_size), cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_total_cols(num_slices, max_nnz_per_slice, result, - item_ct1, block_result_acc_ct1.get_pointer()); - }); - }); -} - - void reduce_max_nnz(size_type size, const size_type* __restrict__ nnz_per_row, size_type* __restrict__ result, sycl::nd_item<3> item_ct1, size_type* block_max) @@ -1009,66 +790,6 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, } -template -void calculate_hybrid_coo_row_nnz(size_type num_rows, - size_type ell_max_nnz_per_row, - IndexType* __restrict__ csr_row_idxs, - size_type* __restrict__ coo_row_nnz, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num_rows) { - const size_type csr_nnz = csr_row_idxs[tidx + 1] - csr_row_idxs[tidx]; - coo_row_nnz[tidx] = - (csr_nnz > ell_max_nnz_per_row) * (csr_nnz - ell_max_nnz_per_row); - } -} - -GKO_ENABLE_DEFAULT_HOST(calculate_hybrid_coo_row_nnz, - calculate_hybrid_coo_row_nnz); - - -template -void fill_in_hybrid(size_type num_rows, size_type stride, - size_type ell_max_nnz_per_row, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - const size_type* __restrict__ coo_offset, - ValueType* __restrict__ result_ell_val, - IndexType* __restrict__ result_ell_col, - ValueType* __restrict__ result_coo_val, - IndexType* __restrict__ result_coo_col, - IndexType* __restrict__ result_coo_row, - sycl::nd_item<3> item_ct1) -{ - constexpr auto warp_size = config::warp_size; - const auto row = thread::get_subwarp_id_flat(item_ct1); - const auto local_tidx = item_ct1.get_local_id(2) % warp_size; - - if (row < num_rows) { - for (size_type i = local_tidx; - i < source_row_ptrs[row + 1] - source_row_ptrs[row]; - i += warp_size) { - const auto source_idx = i + source_row_ptrs[row]; - if (i < ell_max_nnz_per_row) { - const auto result_idx = row + stride * i; - result_ell_val[result_idx] = source_values[source_idx]; - result_ell_col[result_idx] = source_col_idxs[source_idx]; - } else { - const auto result_idx = - coo_offset[row] + i - ell_max_nnz_per_row; - result_coo_val[result_idx] = source_values[source_idx]; - result_coo_col[result_idx] = source_col_idxs[source_idx]; - result_coo_row[result_idx] = row; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_hybrid, fill_in_hybrid); - - template void check_unsorted(const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idxs, IndexType num_rows, @@ -2212,18 +1933,6 @@ void spgeam(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); -template -void convert_row_ptrs_to_idxs(std::shared_ptr exec, - const IndexType* ptrs, size_type num_rows, - IndexType* idxs) -{ - const auto grid_dim = ceildiv(num_rows, default_block_size); - - kernel::convert_row_ptrs_to_idxs(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, ptrs, idxs); -} - - template void fill_in_dense(std::shared_ptr exec, const matrix::Csr* source, @@ -2236,14 +1945,6 @@ void fill_in_dense(std::shared_ptr exec, const auto col_idxs = source->get_const_col_idxs(); const auto vals = source->get_const_values(); - const dim3 block_size(config::warp_size, - config::max_block_size / config::warp_size, 1); - const dim3 init_grid_dim(ceildiv(num_cols, block_size.x), - ceildiv(num_rows, block_size.y), 1); - kernel::initialize_zero_dense(init_grid_dim, block_size, 0, - exec->get_queue(), num_rows, num_cols, stride, - result->get_values()); - auto grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_dense(grid_dim, default_block_size, 0, exec->get_queue(), num_rows, row_ptrs, col_idxs, vals, stride, @@ -2254,150 +1955,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_sellp(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Sellp* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - auto slice_lengths = result->get_slice_lengths(); - auto slice_sets = result->get_slice_sets(); - - const auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); - const int slice_num = ceildiv(num_rows, slice_size); - - const auto source_values = source->get_const_values(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto nnz_per_row = Array(exec, num_rows); - auto grid_dim = ceildiv(num_rows, default_block_size); - - if (grid_dim > 0) { - kernel::calculate_nnz_per_row(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, - source_row_ptrs, nnz_per_row.get_data()); - } - - grid_dim = slice_num; - - if (grid_dim > 0) { - kernel::calculate_slice_lengths( - grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, - slice_size, stride_factor, nnz_per_row.get_const_data(), - slice_lengths, slice_sets); - } - - components::prefix_sum(exec, slice_sets, slice_num + 1); - - grid_dim = ceildiv(num_rows, default_block_size); - if (grid_dim > 0) { - kernel::fill_in_sellp( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - slice_size, source_values, source_row_ptrs, source_col_idxs, - slice_lengths, slice_sets, result_col_idxs, result_values); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); - - -template -void convert_to_ell(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Ell* result) -{ - const auto source_values = source->get_const_values(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - const auto stride = result->get_stride(); - const auto max_nnz_per_row = result->get_num_stored_elements_per_row(); - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - - const auto init_grid_dim = - ceildiv(max_nnz_per_row * num_rows, default_block_size); - - kernel::initialize_zero_ell(init_grid_dim, default_block_size, 0, - exec->get_queue(), max_nnz_per_row, stride, - result_values, result_col_idxs); - - const auto grid_dim = - ceildiv(num_rows * config::warp_size, default_block_size); - - kernel::fill_in_ell(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, stride, source_values, source_row_ptrs, - source_col_idxs, result_values, result_col_idxs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows == 0) { - *result = 0; - return; - } - - const auto slice_num = ceildiv(num_rows, slice_size); - const auto row_ptrs = source->get_const_row_ptrs(); - - auto nnz_per_row = Array(exec, num_rows); - auto grid_dim = ceildiv(num_rows, default_block_size); - - kernel::calculate_nnz_per_row(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, row_ptrs, - nnz_per_row.get_data()); - - grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); - auto max_nnz_per_slice = Array(exec, slice_num); - - kernel::reduce_max_nnz_per_slice( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - slice_size, stride_factor, nnz_per_row.get_const_data(), - max_nnz_per_slice.get_data()); - - grid_dim = ceildiv(slice_num, default_block_size); - auto block_results = Array(exec, grid_dim); - - kernel::reduce_total_cols( - grid_dim, default_block_size, 0, exec->get_queue(), slice_num, - max_nnz_per_slice.get_const_data(), block_results.get_data()); - - auto d_result = Array(exec, 1); - - kernel::reduce_total_cols(1, default_block_size, 0, exec->get_queue(), - grid_dim, block_results.get_const_data(), - d_result.get_data()); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL); - - template void generic_transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -2547,98 +2104,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - - auto nnz_per_row = Array(exec, num_rows); - auto block_results = Array(exec, default_block_size); - auto d_result = Array(exec, 1); - - const auto grid_dim = ceildiv(num_rows, default_block_size); - kernel::calculate_nnz_per_row( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - source->get_const_row_ptrs(), nnz_per_row.get_data()); - - const auto n = ceildiv(num_rows, default_block_size); - const auto reduce_dim = n <= default_block_size ? n : default_block_size; - kernel::reduce_max_nnz(reduce_dim, default_block_size, 0, exec->get_queue(), - num_rows, nnz_per_row.get_const_data(), - block_results.get_data()); - - kernel::reduce_max_nnz(1, default_block_size, 0, exec->get_queue(), - reduce_dim, block_results.get_const_data(), - d_result.get_data()); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void convert_to_hybrid(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Hybrid* result) -{ - auto ell_val = result->get_ell_values(); - auto ell_col = result->get_ell_col_idxs(); - auto coo_val = result->get_coo_values(); - auto coo_col = result->get_coo_col_idxs(); - auto coo_row = result->get_coo_row_idxs(); - const auto stride = result->get_ell_stride(); - const auto max_nnz_per_row = result->get_ell_num_stored_elements_per_row(); - const auto num_rows = result->get_size()[0]; - const auto coo_num_stored_elements = result->get_coo_num_stored_elements(); - auto grid_dim = ceildiv(max_nnz_per_row * num_rows, default_block_size); - - kernel::initialize_zero_ell(grid_dim, default_block_size, 0, - exec->get_queue(), max_nnz_per_row, stride, - ell_val, ell_col); - - grid_dim = ceildiv(num_rows, default_block_size); - auto coo_offset = Array(exec, num_rows); - kernel::calculate_hybrid_coo_row_nnz( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - max_nnz_per_row, source->get_const_row_ptrs(), coo_offset.get_data()); - - components::prefix_sum(exec, coo_offset.get_data(), num_rows); - - grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - kernel::fill_in_hybrid( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, stride, - max_nnz_per_row, source->get_const_values(), - source->get_const_row_ptrs(), source->get_const_col_idxs(), - coo_offset.get_const_data(), ell_val, ell_col, coo_val, coo_col, - coo_row); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Csr* source, - Array* result) -{ - const auto num_rows = source->get_size()[0]; - auto row_ptrs = source->get_const_row_ptrs(); - auto grid_dim = ceildiv(num_rows, default_block_size); - - kernel::calculate_nnz_per_row(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, row_ptrs, - result->get_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index a63b28ebdbe..e511ddcbe2d 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -97,7 +97,7 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, size_type write_to = row_ptrs[tidx]; for (size_type i = 0; i < num_cols; i++) { - if (source[stride * tidx + i] != zero()) { + if (is_nonzero(source[stride * tidx + i])) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; row_idxs[write_to] = tidx; @@ -124,7 +124,7 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, if (row_idx < num_rows) { IndexType part_result{}; for (auto i = warp_tile.thread_rank(); i < num_cols; i += sg_size) { - if (work[stride * row_idx + i] != zero()) { + if (is_nonzero(work[stride * row_idx + i])) { part_result += 1; } } @@ -152,7 +152,7 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; for (size_type i = 0; i < num_cols; i++) { - if (source[stride * tidx + i] != zero()) { + if (is_nonzero(source[stride * tidx + i])) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; write_to++; @@ -175,7 +175,7 @@ void fill_in_ell(size_type num_rows, size_type num_cols, if (tidx < num_rows) { IndexType col_idx = 0; for (size_type col = 0; col < num_cols; col++) { - if (source[tidx * source_stride + col] != zero()) { + if (is_nonzero(source[tidx * source_stride + col])) { col_ptrs[col_idx * result_stride + tidx] = col; values[col_idx * result_stride + tidx] = source[tidx * source_stride + col]; @@ -254,7 +254,7 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, for (size_type col = 0; col < num_cols; col++) { auto val = source[global_row * stride + col]; - if (val != zero()) { + if (is_nonzero(val)) { col_idxs[sellp_ind] = col; vals[sellp_ind] = val; sellp_ind += slice_size; @@ -537,6 +537,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, + const int64* row_ptrs, matrix::Coo* result) { auto num_rows = result->get_size()[0]; @@ -548,11 +549,6 @@ void convert_to_coo(std::shared_ptr exec, auto stride = source->get_stride(); - auto nnz_prefix_sum = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); - - components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); - auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); const std::uint32_t cfg = @@ -565,8 +561,8 @@ void convert_to_coo(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, wg_size); kernel::fill_in_coo(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, stride, nnz_prefix_sum.get_const_data(), - source->get_const_values(), row_idxs, col_idxs, values); + num_cols, stride, row_ptrs, source->get_const_values(), + row_idxs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -597,15 +593,6 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(wg_size, sg_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - - kernel::count_nnz_per_row_call( - cfg, grid_dim_nnz, wg_size, 0, exec->get_queue(), num_rows, num_cols, - stride, source->get_const_values(), row_ptrs); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - size_type grid_dim = ceildiv(num_rows, wg_size); kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), @@ -694,7 +681,7 @@ void convert_to_sellp(std::shared_ptr exec, const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_per_row); + count_nonzeros_per_row(exec, source, nnz_per_row.get_data()); auto grid_dim = slice_num; @@ -730,149 +717,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Dense* source, size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto n = ceildiv(num_rows, wg_size); - const size_type grid_dim = (n <= wg_size) ? n : wg_size; - - auto block_results = Array(exec, grid_dim); - - kernel::reduce_max_nnz_call( - cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), - num_rows, nnz_per_row.get_const_data(), block_results.get_data()); - - auto d_result = Array(exec, 1); - - kernel::reduce_max_nnz_call( - cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), - grid_dim, block_results.get_const_data(), d_result.get_data()); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* source, - Array* result) -{ - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - const dim3 block_size(wg_size, 1, 1); - auto rows_per_block = ceildiv(wg_size, sg_size); - const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); - const dim3 grid_size(grid_x, 1, 1); - if (grid_x > 0) { - kernel::count_nnz_per_row_call( - cfg, grid_size, block_size, 0, exec->get_queue(), - source->get_size()[0], source->get_size()[1], source->get_stride(), - source->get_const_values(), result->get_data()); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows == 0) { - *result = 0; - return; - } - - const auto num_cols = source->get_size()[1]; - const auto slice_num = ceildiv(num_rows, slice_size); - - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - auto max_nnz_per_slice = Array(exec, slice_num); - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - auto grid_dim = ceildiv(slice_num * sg_size, wg_size); - - kernel::reduce_max_nnz_per_slice_call( - cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, - stride_factor, nnz_per_row.get_const_data(), - max_nnz_per_slice.get_data()); - - grid_dim = ceildiv(slice_num, wg_size); - auto block_results = Array(exec, grid_dim); - - kernel::reduce_total_cols_call( - cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), - slice_num, max_nnz_per_slice.get_const_data(), - block_results.get_data()); - - auto d_result = Array(exec, 1); - - kernel::reduce_total_cols_call( - cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), - grid_dim, block_results.get_const_data(), d_result.get_data()); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Dense* orig, diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index bed34ccb051..fbd1063bf13 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -298,107 +298,6 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, } // namespace - - -template -void count_nnz_per_row(size_type num_rows, size_type max_nnz_per_row, - size_type stride, const ValueType* __restrict__ values, - IndexType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - constexpr auto warp_size = config::warp_size; - const auto row_idx = thread::get_subwarp_id_flat(item_ct1); - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - - if (row_idx < num_rows) { - IndexType part_result{}; - for (auto i = warp_tile.thread_rank(); i < max_nnz_per_row; - i += warp_size) { - if (values[stride * i + row_idx] != zero()) { - part_result += 1; - } - } - result[row_idx] = ::gko::kernels::dpcpp::reduce( - warp_tile, part_result, - [](const size_type& a, const size_type& b) { return a + b; }); - } -} - -template -void count_nnz_per_row(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - size_type max_nnz_per_row, size_type stride, - const ValueType* values, IndexType* result) -{ - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - count_nnz_per_row(num_rows, max_nnz_per_row, stride, values, - result, item_ct1); - }); - }); -} - -#define GKO_ELL_COUNT_NNZ_PER_ROW(ValueType, IndexType) \ - void count_nnz_per_row(dim3, dim3, size_type, sycl::queue*, size_type, \ - size_type, size_type, const ValueType*, IndexType*) - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_ELL_COUNT_NNZ_PER_ROW); - -#undef GKO_ELL_COUNT_NNZ_PER_ROW - - -template -void fill_in_csr(size_type num_rows, size_type max_nnz_per_row, - size_type stride, const ValueType* __restrict__ source_values, - const IndexType* __restrict__ source_col_idxs, - IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx < num_rows) { - auto write_to = result_row_ptrs[tidx]; - for (size_type i = 0; i < max_nnz_per_row; i++) { - const auto source_idx = tidx + stride * i; - if (source_values[source_idx] != zero()) { - result_values[write_to] = source_values[source_idx]; - result_col_idxs[write_to] = source_col_idxs[source_idx]; - write_to++; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr); - - -template -void extract_diagonal(size_type diag_size, size_type max_nnz_per_row, - size_type orig_stride, - const ValueType* __restrict__ orig_values, - const IndexType* __restrict__ orig_col_idxs, - ValueType* __restrict__ diag, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - const auto row = tidx % diag_size; - const auto col = tidx / diag_size; - const auto ell_ind = orig_stride * col + row; - - if (col < max_nnz_per_row) { - if (orig_col_idxs[ell_ind] == row && - orig_values[ell_ind] != zero()) { - diag[row] = orig_values[ell_ind]; - } - } -} - -GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal); - - } // namespace kernel @@ -579,30 +478,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Ell* orig, - matrix::Diagonal* diag) -{ - const auto max_nnz_per_row = orig->get_num_stored_elements_per_row(); - const auto orig_stride = orig->get_stride(); - const auto diag_size = diag->get_size()[0]; - const auto num_blocks = - ceildiv(diag_size * max_nnz_per_row, default_block_size); - - const auto orig_values = orig->get_const_values(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - - kernel::extract_diagonal( - num_blocks, default_block_size, 0, exec->get_queue(), diag_size, - max_nnz_per_row, orig_stride, orig_values, orig_col_idxs, diag_values); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); - - } // namespace ell } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp index d1d1707ea18..22f971466ad 100644 --- a/dpcpp/matrix/fbcsr_kernels.dp.cpp +++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp @@ -89,12 +89,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); -template -void convert_row_ptrs_to_idxs(std::shared_ptr exec, - const IndexType* ptrs, size_type num_rows, - IndexType* idxs) GKO_NOT_IMPLEMENTED; - - template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, @@ -133,26 +127,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -template -void calculate_max_nnz_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* source, - Array* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void is_sorted_by_column_index( std::shared_ptr exec, diff --git a/dpcpp/matrix/hybrid_kernels.dp.cpp b/dpcpp/matrix/hybrid_kernels.dp.cpp index 99509cfe2fc..80a79dee850 100644 --- a/dpcpp/matrix/hybrid_kernels.dp.cpp +++ b/dpcpp/matrix/hybrid_kernels.dp.cpp @@ -37,22 +37,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_kernels.hpp" -#include "core/matrix/ell_kernels.hpp" -#include "dpcpp/base/config.hpp" -#include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/helper.hpp" -#include "dpcpp/components/atomic.dp.hpp" -#include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/format_conversion.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" -#include "dpcpp/components/segment_scan.dp.hpp" -#include "dpcpp/components/thread_ids.dp.hpp" namespace gko { @@ -66,145 +50,6 @@ namespace dpcpp { namespace hybrid { -constexpr int default_block_size = 256; -constexpr int warps_in_block = 4; - - -namespace kernel { - - -/** - * The global function for counting the number of nonzeros per row of COO. - * It is almost like COO spmv routine. - * It performs is_nonzeros(Coo) times the vector whose values are one - * - * @param nnz the number of nonzeros in the matrix - * @param num_line the maximum round of each warp - * @param val the value array of the matrix - * @param row the row index array of the matrix - * @param nnz_per_row the output nonzeros per row - */ -template -void count_coo_row_nnz(const size_type nnz, const size_type num_lines, - const ValueType* __restrict__ val, - const IndexType* __restrict__ row, - IndexType* __restrict__ nnz_per_row, - sycl::nd_item<3> item_ct1) -{ - IndexType temp_val = 0; - const auto start = - static_cast(item_ct1.get_local_range().get(2)) * - item_ct1.get_group(2) * item_ct1.get_local_range().get(1) * - num_lines + - item_ct1.get_local_id(1) * item_ct1.get_local_range().get(2) * - num_lines; - size_type num = (nnz > start) * ceildiv(nnz - start, subgroup_size); - num = min(num, num_lines); - const IndexType ind_start = start + item_ct1.get_local_id(2); - const IndexType ind_end = ind_start + (num - 1) * subgroup_size; - IndexType ind = ind_start; - IndexType curr_row = (ind < nnz) ? row[ind] : 0; - const auto tile_block = group::tiled_partition( - group::this_thread_block(item_ct1)); - for (; ind < ind_end; ind += subgroup_size) { - temp_val += ind < nnz && val[ind] != zero(); - auto next_row = (ind + subgroup_size < nnz) ? row[ind + subgroup_size] - : row[nnz - 1]; - // segmented scan - if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(nnz_per_row[curr_row]), temp_val); - } - temp_val = 0; - } - curr_row = next_row; - } - if (num > 0) { - ind = ind_end; - temp_val += ind < nnz && val[ind] != zero(); - // segmented scan - - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); - if (is_first_in_segment) { - atomic_add(&(nnz_per_row[curr_row]), temp_val); - } - } -} - -template -void count_coo_row_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const size_type nnz, - const size_type num_lines, const ValueType* val, - const IndexType* row, IndexType* nnz_per_row) -{ - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - count_coo_row_nnz(nnz, num_lines, val, row, - nnz_per_row, item_ct1); - }); - }); -} - - -template -void fill_in_csr(size_type num_rows, size_type max_nnz_per_row, - size_type stride, const ValueType* __restrict__ ell_val, - const IndexType* __restrict__ ell_col, - const ValueType* __restrict__ coo_val, - const IndexType* __restrict__ coo_col, - const IndexType* __restrict__ coo_offset, - IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx < num_rows) { - auto write_to = result_row_ptrs[tidx]; - for (size_type i = 0; i < max_nnz_per_row; i++) { - const auto source_idx = tidx + stride * i; - if (ell_val[source_idx] != zero()) { - result_values[write_to] = ell_val[source_idx]; - result_col_idxs[write_to] = ell_col[source_idx]; - write_to++; - } - } - for (auto i = coo_offset[tidx]; i < coo_offset[tidx + 1]; i++) { - if (coo_val[i] != zero()) { - result_values[write_to] = coo_val[i]; - result_col_idxs[write_to] = coo_col[i]; - write_to++; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr); - - -template -void add(size_type num, ValueType1* __restrict__ val1, - const ValueType2* __restrict__ val2, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num) { - val1[tidx] += val2[tidx]; - } -} - -GKO_ENABLE_DEFAULT_HOST(add, add); - - -} // namespace kernel - - template void split_matrix_data( std::shared_ptr exec, diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp index 8a0961533b6..fa37e6b8e7e 100644 --- a/dpcpp/matrix/sellp_kernels.dp.cpp +++ b/dpcpp/matrix/sellp_kernels.dp.cpp @@ -135,115 +135,6 @@ GKO_ENABLE_DEFAULT_HOST(advanced_spmv_kernel, advanced_spmv_kernel); } // namespace -namespace kernel { - - -template -void count_nnz_per_row(size_type num_rows, size_type slice_size, - const size_type* __restrict__ slice_sets, - const ValueType* __restrict__ values, - IndexType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - constexpr auto warp_size = config::warp_size; - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - const auto row_idx = thread::get_subwarp_id_flat(item_ct1); - const auto slice_id = row_idx / slice_size; - const auto tid_in_warp = warp_tile.thread_rank(); - const auto row_in_slice = row_idx % slice_size; - - if (row_idx < num_rows) { - IndexType part_result{}; - for (size_type sellp_ind = - (slice_sets[slice_id] + tid_in_warp) * slice_size + - row_in_slice; - sellp_ind < slice_sets[slice_id + 1] * slice_size; - sellp_ind += warp_size * slice_size) { - if (values[sellp_ind] != zero()) { - part_result += 1; - } - } - result[row_idx] = ::gko::kernels::dpcpp::reduce( - warp_tile, part_result, - [](const size_type& a, const size_type& b) { return a + b; }); - } -} - -GKO_ENABLE_DEFAULT_HOST(count_nnz_per_row, count_nnz_per_row); - - -template -void fill_in_csr(size_type num_rows, size_type slice_size, - const size_type* __restrict__ source_slice_sets, - const IndexType* __restrict__ source_col_idxs, - const ValueType* __restrict__ source_values, - IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values, - sycl::nd_item<3> item_ct1) -{ - const auto row = thread::get_thread_id_flat(item_ct1); - const auto slice_id = row / slice_size; - const auto row_in_slice = row % slice_size; - - if (row < num_rows) { - size_type csr_ind = result_row_ptrs[row]; - for (size_type sellp_ind = - source_slice_sets[slice_id] * slice_size + row_in_slice; - sellp_ind < source_slice_sets[slice_id + 1] * slice_size; - sellp_ind += slice_size) { - if (source_values[sellp_ind] != zero()) { - result_values[csr_ind] = source_values[sellp_ind]; - result_col_idxs[csr_ind] = source_col_idxs[sellp_ind]; - csr_ind++; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr); - - -template -void extract_diagonal(size_type diag_size, size_type slice_size, - const size_type* __restrict__ orig_slice_sets, - const ValueType* __restrict__ orig_values, - const IndexType* __restrict__ orig_col_idxs, - ValueType* __restrict__ diag, sycl::nd_item<3> item_ct1) -{ - constexpr auto warp_size = config::warp_size; - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - const auto slice_id = thread::get_subwarp_id_flat(item_ct1); - const auto tid_in_warp = warp_tile.thread_rank(); - const auto slice_num = ceildiv(diag_size, slice_size); - - if (slice_id >= slice_num) { - return; - } - - const auto start_ind = orig_slice_sets[slice_id] * slice_size + tid_in_warp; - const auto end_ind = orig_slice_sets[slice_id + 1] * slice_size; - - for (auto sellp_ind = start_ind; sellp_ind < end_ind; - sellp_ind += warp_size) { - auto global_row = slice_id * slice_size + sellp_ind % slice_size; - if (global_row < diag_size) { - if (orig_col_idxs[sellp_ind] == global_row && - orig_values[sellp_ind] != zero()) { - diag[global_row] = orig_values[sellp_ind]; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal); - - -} // namespace kernel - - template void spmv(std::shared_ptr exec, const matrix::Sellp* a, @@ -288,107 +179,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Csr* result) -{ - const auto num_rows = source->get_size()[0]; - const auto slice_size = source->get_slice_size(); - const auto slice_num = ceildiv(num_rows, slice_size); - - const auto source_values = source->get_const_values(); - const auto source_slice_lengths = source->get_const_slice_lengths(); - const auto source_slice_sets = source->get_const_slice_sets(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - auto result_row_ptrs = result->get_row_ptrs(); - - auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - - if (grid_dim > 0) { - kernel::count_nnz_per_row( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - slice_size, source_slice_sets, source_values, result_row_ptrs); - } - - grid_dim = ceildiv(num_rows + 1, default_block_size); - auto add_values = Array(exec, grid_dim); - - components::prefix_sum(exec, result_row_ptrs, num_rows + 1); - - grid_dim = ceildiv(num_rows, default_block_size); - - if (grid_dim > 0) { - kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, slice_size, source_slice_sets, - source_col_idxs, source_values, result_row_ptrs, - result_col_idxs, result_values); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Sellp* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows <= 0) { - *result = 0; - return; - } - - const auto slice_size = source->get_slice_size(); - const auto slice_sets = source->get_const_slice_sets(); - const auto values = source->get_const_values(); - - auto nnz_per_row = Array(exec, num_rows); - - auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - - kernel::count_nnz_per_row(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, slice_size, - slice_sets, values, nnz_per_row.get_data()); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Sellp* orig, - matrix::Diagonal* diag) -{ - const auto diag_size = diag->get_size()[0]; - const auto slice_size = orig->get_slice_size(); - const auto slice_num = ceildiv(diag_size, slice_size); - const auto num_blocks = - ceildiv(slice_num * config::warp_size, default_block_size); - - const auto orig_slice_sets = orig->get_const_slice_sets(); - const auto orig_values = orig->get_const_values(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - - kernel::extract_diagonal( - num_blocks, default_block_size, 0, exec->get_queue(), diag_size, - slice_size, orig_slice_sets, orig_values, orig_col_idxs, diag_values); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); - - } // namespace sellp } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index d64d7546e4c..81ffcb87cd3 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -69,6 +69,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::SparsityCsr* input, + matrix::Dense* output) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); + + template void count_num_diagonal_elements( std::shared_ptr exec, diff --git a/dpcpp/test/matrix/coo_kernels.cpp b/dpcpp/test/matrix/coo_kernels.cpp index f47dc96d40e..5a6dc4e0df0 100644 --- a/dpcpp/test/matrix/coo_kernels.cpp +++ b/dpcpp/test/matrix/coo_kernels.cpp @@ -343,7 +343,7 @@ TEST_F(Coo, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -358,7 +358,7 @@ TEST_F(Coo, ConvertToCsrIsEquivalentToRef) dense_mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp index cb460e9f726..44c24dffd2a 100644 --- a/dpcpp/test/matrix/csr_kernels.cpp +++ b/dpcpp/test/matrix/csr_kernels.cpp @@ -521,7 +521,7 @@ TEST_F(Csr, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -534,7 +534,7 @@ TEST_F(Csr, MoveToDenseIsEquivalentToRef) mtx->move_to(dense_mtx.get()); dmtx->move_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -547,7 +547,7 @@ TEST_F(Csr, ConvertToEllIsEquivalentToRef) mtx->convert_to(ell_mtx.get()); dmtx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 0); } @@ -560,7 +560,7 @@ TEST_F(Csr, MoveToEllIsEquivalentToRef) mtx->move_to(ell_mtx.get()); dmtx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 0); } @@ -573,8 +573,7 @@ TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef) mtx->convert_to(sparsity_mtx.get()); dmtx->convert_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), - r::value); + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 0); } @@ -587,23 +586,7 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) mtx->move_to(sparsity_mtx.get()); dmtx->move_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), - r::value); -} - - -TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef) -{ - set_up_apply_data(std::make_shared()); - gko::size_type max_nnz_per_row; - gko::size_type dmax_nnz_per_row; - - gko::kernels::reference::csr::calculate_max_nnz_per_row(ref, mtx.get(), - &max_nnz_per_row); - gko::kernels::dpcpp::csr::calculate_max_nnz_per_row(dpcpp, dmtx.get(), - &dmax_nnz_per_row); - - ASSERT_EQ(max_nnz_per_row, dmax_nnz_per_row); + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 0); } @@ -616,7 +599,7 @@ TEST_F(Csr, ConvertToCooIsEquivalentToRef) mtx->convert_to(coo_mtx.get()); dmtx->convert_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -629,7 +612,7 @@ TEST_F(Csr, MoveToCooIsEquivalentToRef) mtx->move_to(coo_mtx.get()); dmtx->move_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -642,7 +625,7 @@ TEST_F(Csr, ConvertToSellpIsEquivalentToRef) mtx->convert_to(sellp_mtx.get()); dmtx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 0); } @@ -655,7 +638,7 @@ TEST_F(Csr, MoveToSellpIsEquivalentToRef) mtx->move_to(sellp_mtx.get()); dmtx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 0); } @@ -671,31 +654,16 @@ TEST_F(Csr, ConvertsEmptyToSellp) } -TEST_F(Csr, CalculateTotalColsIsEquivalentToRef) -{ - set_up_apply_data(std::make_shared()); - gko::size_type total_cols; - gko::size_type dtotal_cols; - - gko::kernels::reference::csr::calculate_total_cols( - ref, mtx.get(), &total_cols, 2, gko::matrix::default_slice_size); - gko::kernels::dpcpp::csr::calculate_total_cols( - dpcpp, dmtx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); - - ASSERT_EQ(total_cols, dtotal_cols); -} - - TEST_F(Csr, CalculatesNonzerosPerRow) { set_up_apply_data(std::make_shared()); gko::Array row_nnz(ref, mtx->get_size()[0]); gko::Array drow_nnz(dpcpp, dmtx->get_size()[0]); - gko::kernels::reference::csr::calculate_nonzeros_per_row(ref, mtx.get(), - &row_nnz); - gko::kernels::dpcpp::csr::calculate_nonzeros_per_row(dpcpp, dmtx.get(), - &drow_nnz); + gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), + row_nnz.get_data()); + gko::kernels::dpcpp::csr::count_nonzeros_per_row(dpcpp, dmtx.get(), + drow_nnz.get_data()); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } @@ -713,7 +681,7 @@ TEST_F(Csr, ConvertToHybridIsEquivalentToRef) mtx->convert_to(hybrid_mtx.get()); dmtx->convert_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } @@ -729,7 +697,7 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef) mtx->move_to(hybrid_mtx.get()); dmtx->move_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index ca17d51cc4e..e8976754b98 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -425,7 +425,7 @@ TEST_F(Dense, ConvertToCooIsEquivalentToRef) ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 0); } @@ -440,7 +440,7 @@ TEST_F(Dense, MoveToCooIsEquivalentToRef) ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 0); } @@ -453,7 +453,7 @@ TEST_F(Dense, ConvertToCsrIsEquivalentToRef) x->convert_to(csr_mtx.get()); dx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 0); } @@ -466,7 +466,7 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) x->move_to(csr_mtx.get()); dx->move_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 0); } @@ -479,7 +479,7 @@ TEST_F(Dense, ConvertToEllIsEquivalentToRef) x->convert_to(ell_mtx.get()); dx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 0); } @@ -492,7 +492,7 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef) x->move_to(ell_mtx.get()); dx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 0); } @@ -505,7 +505,7 @@ TEST_F(Dense, ConvertToSellpIsEquivalentToRef) x->convert_to(sellp_mtx.get()); dx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 0); } @@ -518,7 +518,7 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef) x->move_to(sellp_mtx.get()); dx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 0); } @@ -534,19 +534,6 @@ TEST_F(Dense, ConvertsEmptyToSellp) } -TEST_F(Dense, CountNNZIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); - gko::kernels::dpcpp::dense::count_nonzeros(dpcpp, dx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); -} - - TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); @@ -555,10 +542,10 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) gko::Array dnnz_per_row(dpcpp); dnnz_per_row.resize_and_reset(dx->get_size()[0]); - gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), - &nnz_per_row); - gko::kernels::dpcpp::dense::calculate_nonzeros_per_row(dpcpp, dx.get(), - &dnnz_per_row); + gko::kernels::reference::dense::count_nonzeros_per_row( + ref, x.get(), nnz_per_row.get_data()); + gko::kernels::dpcpp::dense::count_nonzeros_per_row(dpcpp, dx.get(), + dnnz_per_row.get_data()); auto tmp = gko::Array(ref, dnnz_per_row); for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { @@ -567,34 +554,19 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) } -TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef) { set_up_apply_data(); gko::size_type max_nnz; gko::size_type dmax_nnz; - gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), - &max_nnz); - gko::kernels::dpcpp::dense::calculate_max_nnz_per_row(dpcpp, dx.get(), - &dmax_nnz); + gko::kernels::reference::dense::compute_max_nnz_per_row(ref, x.get(), + max_nnz); + gko::kernels::dpcpp::dense::compute_max_nnz_per_row(dpcpp, dx.get(), + dmax_nnz); ASSERT_EQ(max_nnz, dmax_nnz); } -TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type total_cols; - gko::size_type dtotal_cols; - - gko::kernels::reference::dense::calculate_total_cols( - ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); - gko::kernels::dpcpp::dense::calculate_total_cols( - dpcpp, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); - - ASSERT_EQ(total_cols, dtotal_cols); -} - - } // namespace diff --git a/dpcpp/test/matrix/ell_kernels.cpp b/dpcpp/test/matrix/ell_kernels.cpp index daad34a12b6..c8cd0503561 100644 --- a/dpcpp/test/matrix/ell_kernels.cpp +++ b/dpcpp/test/matrix/ell_kernels.cpp @@ -560,7 +560,7 @@ TEST_F(Ell, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -574,7 +574,7 @@ TEST_F(Ell, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } @@ -582,37 +582,20 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); - gko::Array nnz_per_row; + gko::Array nnz_per_row; nnz_per_row.set_executor(ref); nnz_per_row.resize_and_reset(mtx->get_size()[0]); - gko::Array dnnz_per_row; + gko::Array dnnz_per_row; dnnz_per_row.set_executor(dpcpp); dnnz_per_row.resize_and_reset(dmtx->get_size()[0]); - gko::kernels::reference::ell::calculate_nonzeros_per_row(ref, mtx.get(), - &nnz_per_row); - gko::kernels::dpcpp::ell::calculate_nonzeros_per_row(dpcpp, dmtx.get(), - &dnnz_per_row); + gko::kernels::reference::ell::count_nonzeros_per_row( + ref, mtx.get(), nnz_per_row.get_data()); + gko::kernels::dpcpp::ell::count_nonzeros_per_row(dpcpp, dmtx.get(), + dnnz_per_row.get_data()); - auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { - ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); - } -} - - -TEST_F(Ell, CountNNZIsEquivalentToRef) -{ - set_up_apply_data(); - - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::ell::count_nonzeros(ref, mtx.get(), &nnz); - gko::kernels::dpcpp::ell::count_nonzeros(dpcpp, dmtx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); + GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row); } diff --git a/dpcpp/test/matrix/hybrid_kernels.cpp b/dpcpp/test/matrix/hybrid_kernels.cpp index 7c76bb1ee18..0b149c25731 100644 --- a/dpcpp/test/matrix/hybrid_kernels.cpp +++ b/dpcpp/test/matrix/hybrid_kernels.cpp @@ -212,19 +212,6 @@ TEST_F(Hybrid, AdvancedApplyToComplexIsEquivalentToRef) } -TEST_F(Hybrid, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type nonzeros; - gko::size_type dnonzeros; - - gko::kernels::reference::hybrid::count_nonzeros(ref, mtx.get(), &nonzeros); - gko::kernels::dpcpp::hybrid::count_nonzeros(dpcpp, dmtx.get(), &dnonzeros); - - ASSERT_EQ(nonzeros, dnonzeros); -} - - TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) { set_up_apply_data(1, std::make_shared(2)); @@ -234,7 +221,7 @@ TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } @@ -247,7 +234,7 @@ TEST_F(Hybrid, MoveToCsrIsEquivalentToRef) mtx->move_to(csr_mtx.get()); dmtx->move_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/dpcpp/test/matrix/sellp_kernels.cpp b/dpcpp/test/matrix/sellp_kernels.cpp index 308f48d77aa..826f0f86c86 100644 --- a/dpcpp/test/matrix/sellp_kernels.cpp +++ b/dpcpp/test/matrix/sellp_kernels.cpp @@ -254,7 +254,7 @@ TEST_F(Sellp, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -267,7 +267,7 @@ TEST_F(Sellp, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), r::value); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } @@ -297,19 +297,6 @@ TEST_F(Sellp, ConvertEmptyToCsrIsEquivalentToRef) } -TEST_F(Sellp, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_matrix(64); - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::sellp::count_nonzeros(ref, mtx.get(), &nnz); - gko::kernels::dpcpp::sellp::count_nonzeros(dpcpp, dmtx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); -} - - TEST_F(Sellp, ExtractDiagonalIsEquivalentToRef) { set_up_apply_matrix(64); diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index cc60837eb3f..265ce8c6ac0 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -719,160 +719,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_sellp(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Sellp* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - auto slice_lengths = result->get_slice_lengths(); - auto slice_sets = result->get_slice_sets(); - - const auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); - const int slice_num = ceildiv(num_rows, slice_size); - - const auto source_values = source->get_const_values(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto nnz_per_row = Array(exec, num_rows); - auto grid_dim = ceildiv(num_rows, default_block_size); - - if (grid_dim > 0) { - hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - as_hip_type(source_row_ptrs), - as_hip_type(nnz_per_row.get_data())); - } - - grid_dim = slice_num; - - if (grid_dim > 0) { - hipLaunchKernelGGL(kernel::calculate_slice_lengths, dim3(grid_dim), - dim3(config::warp_size), 0, 0, num_rows, slice_size, - stride_factor, - as_hip_type(nnz_per_row.get_const_data()), - as_hip_type(slice_lengths), as_hip_type(slice_sets)); - } - - components::prefix_sum(exec, slice_sets, slice_num + 1); - - grid_dim = ceildiv(num_rows, default_block_size); - if (grid_dim > 0) { - hipLaunchKernelGGL( - kernel::fill_in_sellp, dim3(grid_dim), dim3(default_block_size), 0, - 0, num_rows, slice_size, as_hip_type(source_values), - as_hip_type(source_row_ptrs), as_hip_type(source_col_idxs), - as_hip_type(slice_lengths), as_hip_type(slice_sets), - as_hip_type(result_col_idxs), as_hip_type(result_values)); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); - - -template -void convert_to_ell(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Ell* result) -{ - const auto source_values = source->get_const_values(); - const auto source_row_ptrs = source->get_const_row_ptrs(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - const auto stride = result->get_stride(); - const auto max_nnz_per_row = result->get_num_stored_elements_per_row(); - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - - const auto init_grid_dim = - ceildiv(max_nnz_per_row * num_rows, default_block_size); - - hipLaunchKernelGGL(kernel::initialize_zero_ell, dim3(init_grid_dim), - dim3(default_block_size), 0, 0, max_nnz_per_row, stride, - as_hip_type(result_values), - as_hip_type(result_col_idxs)); - - const auto grid_dim = - ceildiv(num_rows * config::warp_size, default_block_size); - - hipLaunchKernelGGL(kernel::fill_in_ell, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, stride, - as_hip_type(source_values), as_hip_type(source_row_ptrs), - as_hip_type(source_col_idxs), as_hip_type(result_values), - as_hip_type(result_col_idxs)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows == 0) { - *result = 0; - return; - } - - const auto slice_num = ceildiv(num_rows, slice_size); - const auto row_ptrs = source->get_const_row_ptrs(); - - auto nnz_per_row = Array(exec, num_rows); - auto grid_dim = ceildiv(num_rows, default_block_size); - - hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - as_hip_type(row_ptrs), - as_hip_type(nnz_per_row.get_data())); - - grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); - auto max_nnz_per_slice = Array(exec, slice_num); - - hipLaunchKernelGGL(kernel::reduce_max_nnz_per_slice, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, slice_size, - stride_factor, as_hip_type(nnz_per_row.get_const_data()), - as_hip_type(max_nnz_per_slice.get_data())); - - grid_dim = ceildiv(slice_num, default_block_size); - auto block_results = Array(exec, grid_dim); - - hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(grid_dim), - dim3(default_block_size), 0, 0, slice_num, - as_hip_type(max_nnz_per_slice.get_const_data()), - as_hip_type(block_results.get_data())); - - auto d_result = Array(exec, 1); - - hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(1), - dim3(default_block_size), 0, 0, grid_dim, - as_hip_type(block_results.get_const_data()), - as_hip_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -1008,42 +854,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - - auto nnz_per_row = Array(exec, num_rows); - auto block_results = Array(exec, default_block_size); - auto d_result = Array(exec, 1); - - const auto grid_dim = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - as_hip_type(source->get_const_row_ptrs()), - as_hip_type(nnz_per_row.get_data())); - - const auto n = ceildiv(num_rows, default_block_size); - const auto reduce_dim = n <= default_block_size ? n : default_block_size; - hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(reduce_dim), - dim3(default_block_size), 0, 0, num_rows, - as_hip_type(nnz_per_row.get_const_data()), - as_hip_type(block_results.get_data())); - - hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(1), - dim3(default_block_size), 0, 0, reduce_dim, - as_hip_type(block_results.get_const_data()), - as_hip_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - template void calculate_nonzeros_per_row_in_span( std::shared_ptr exec, @@ -1094,70 +904,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); -template -void convert_to_hybrid(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Hybrid* result) -{ - auto ell_val = result->get_ell_values(); - auto ell_col = result->get_ell_col_idxs(); - auto coo_val = result->get_coo_values(); - auto coo_col = result->get_coo_col_idxs(); - auto coo_row = result->get_coo_row_idxs(); - const auto stride = result->get_ell_stride(); - const auto max_nnz_per_row = result->get_ell_num_stored_elements_per_row(); - const auto num_rows = result->get_size()[0]; - const auto coo_num_stored_elements = result->get_coo_num_stored_elements(); - auto grid_dim = ceildiv(max_nnz_per_row * num_rows, default_block_size); - - hipLaunchKernelGGL(kernel::initialize_zero_ell, dim3(grid_dim), - dim3(default_block_size), 0, 0, max_nnz_per_row, stride, - as_hip_type(ell_val), as_hip_type(ell_col)); - - grid_dim = ceildiv(num_rows, default_block_size); - auto coo_offset = Array(exec, num_rows); - hipLaunchKernelGGL(kernel::calculate_hybrid_coo_row_nnz, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - max_nnz_per_row, - as_hip_type(source->get_const_row_ptrs()), - as_hip_type(coo_offset.get_data())); - - components::prefix_sum(exec, coo_offset.get_data(), num_rows); - - grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - hipLaunchKernelGGL(kernel::fill_in_hybrid, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, stride, - max_nnz_per_row, as_hip_type(source->get_const_values()), - as_hip_type(source->get_const_row_ptrs()), - as_hip_type(source->get_const_col_idxs()), - as_hip_type(coo_offset.get_const_data()), - as_hip_type(ell_val), as_hip_type(ell_col), - as_hip_type(coo_val), as_hip_type(coo_col), - as_hip_type(coo_row)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Csr* source, - Array* result) -{ - const auto num_rows = source->get_size()[0]; - auto row_ptrs = source->get_const_row_ptrs(); - auto grid_dim = ceildiv(num_rows, default_block_size); - - hipLaunchKernelGGL(kernel::calculate_nnz_per_row, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, - as_hip_type(row_ptrs), as_hip_type(result->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index e24c625bd45..5b74518e23c 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -51,6 +52,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -134,6 +136,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, + const int64* row_ptrs, matrix::Coo* result) { auto num_rows = result->get_size()[0]; @@ -145,19 +148,14 @@ void convert_to_coo(std::shared_ptr exec, auto stride = source->get_stride(); - auto nnz_prefix_sum = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); - - components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); - - const size_type grid_dim = ceildiv(num_rows, default_block_size); - - hipLaunchKernelGGL(kernel::fill_in_coo, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, num_cols, - stride, as_hip_type(nnz_prefix_sum.get_const_data()), - as_hip_type(source->get_const_values()), - as_hip_type(row_idxs), as_hip_type(col_idxs), - as_hip_type(values)); + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::fill_in_coo, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, num_cols, + stride, as_hip_type(source->get_const_values()), + row_ptrs, row_idxs, col_idxs, as_hip_type(values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -178,22 +176,14 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, config::warp_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - - hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_dim_nnz), - dim3(default_block_size), 0, 0, num_rows, num_cols, - stride, as_hip_type(source->get_const_values()), - as_hip_type(row_ptrs)); - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - - size_type grid_dim = ceildiv(num_rows, default_block_size); - - hipLaunchKernelGGL( - kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, - num_rows, num_cols, stride, as_hip_type(source->get_const_values()), - as_hip_type(row_ptrs), as_hip_type(col_idxs), as_hip_type(values)); + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + hipLaunchKernelGGL( + kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, + num_rows, num_cols, stride, as_hip_type(source->get_const_values()), + as_hip_type(row_ptrs), as_hip_type(col_idxs), as_hip_type(values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -209,18 +199,21 @@ void convert_to_ell(std::shared_ptr exec, auto num_cols = result->get_size()[1]; auto max_nnz_per_row = result->get_num_stored_elements_per_row(); - auto col_ptrs = result->get_col_idxs(); + auto col_idxs = result->get_col_idxs(); auto values = result->get_values(); auto source_stride = source->get_stride(); auto result_stride = result->get_stride(); - auto grid_dim = ceildiv(result_stride, default_block_size); - hipLaunchKernelGGL(kernel::fill_in_ell, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, num_cols, - source_stride, as_hip_type(source->get_const_values()), - max_nnz_per_row, result_stride, as_hip_type(col_ptrs), - as_hip_type(values)); + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + hipLaunchKernelGGL( + kernel::fill_in_ell, dim3(grid_dim), dim3(default_block_size), 0, 0, + num_rows, num_cols, source_stride, + as_hip_type(source->get_const_values()), max_nnz_per_row, + result_stride, col_idxs, as_hip_type(values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -230,8 +223,31 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, + const int64* coo_row_ptrs, matrix::Hybrid* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto ell_max_nnz_per_row = + result->get_ell_num_stored_elements_per_row(); + const auto source_stride = source->get_stride(); + const auto ell_stride = result->get_ell_stride(); + auto ell_col_idxs = result->get_ell_col_idxs(); + auto ell_values = result->get_ell_values(); + auto coo_row_idxs = result->get_coo_row_idxs(); + auto coo_col_idxs = result->get_coo_col_idxs(); + auto coo_values = result->get_coo_values(); + + auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::fill_in_hybrid, grid_dim, default_block_size, + 0, 0, num_rows, num_cols, source_stride, + as_hip_type(source->get_const_values()), + ell_max_nnz_per_row, ell_stride, ell_col_idxs, + as_hip_type(ell_values), coo_row_ptrs, coo_row_idxs, + coo_col_idxs, as_hip_type(coo_values)); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -257,24 +273,8 @@ void convert_to_sellp(std::shared_ptr exec, const auto stride_factor = (result->get_stride_factor() == 0) ? matrix::default_stride_factor : result->get_stride_factor(); - const int slice_num = ceildiv(num_rows, slice_size); - - auto nnz_per_row = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - auto grid_dim = slice_num; - - if (grid_dim > 0) { - hipLaunchKernelGGL(kernel::calculate_slice_lengths, dim3(grid_dim), - dim3(config::warp_size), 0, 0, num_rows, slice_size, - slice_num, stride_factor, - as_hip_type(nnz_per_row.get_const_data()), - as_hip_type(slice_lengths), as_hip_type(slice_sets)); - } - - components::prefix_sum(exec, slice_sets, slice_num + 1); - grid_dim = ceildiv(num_rows, default_block_size); + auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); if (grid_dim > 0) { hipLaunchKernelGGL( kernel::fill_in_sellp, dim3(grid_dim), dim3(default_block_size), 0, @@ -298,133 +298,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Dense* source, size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - const auto n = ceildiv(num_rows, default_block_size); - const size_type grid_dim = - (n <= default_block_size) ? n : default_block_size; - - auto block_results = Array(exec, grid_dim); - - hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(grid_dim), - dim3(default_block_size), - default_block_size * sizeof(size_type), 0, num_rows, - as_hip_type(nnz_per_row.get_const_data()), - as_hip_type(block_results.get_data())); - - auto d_result = Array(exec, 1); - - hipLaunchKernelGGL(kernel::reduce_max_nnz, dim3(1), - dim3(default_block_size), - default_block_size * sizeof(size_type), 0, grid_dim, - as_hip_type(block_results.get_const_data()), - as_hip_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* source, - Array* result) -{ - const dim3 block_size(default_block_size, 1, 1); - auto rows_per_block = ceildiv(default_block_size, config::warp_size); - const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); - const dim3 grid_size(grid_x, 1, 1); - if (grid_x > 0) { - hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_size), - dim3(block_size), 0, 0, source->get_size()[0], - source->get_size()[1], source->get_stride(), - as_hip_type(source->get_const_values()), - as_hip_type(result->get_data())); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows == 0) { - *result = 0; - return; - } - - const auto num_cols = source->get_size()[1]; - const auto slice_num = ceildiv(num_rows, slice_size); - - auto nnz_per_row = Array(exec, num_rows); - - calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - auto max_nnz_per_slice = Array(exec, slice_num); - - auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); - - hipLaunchKernelGGL(kernel::reduce_max_nnz_per_slice, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, slice_size, - stride_factor, as_hip_type(nnz_per_row.get_const_data()), - as_hip_type(max_nnz_per_slice.get_data())); - - grid_dim = ceildiv(slice_num, default_block_size); - auto block_results = Array(exec, grid_dim); - - hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(grid_dim), - dim3(default_block_size), - default_block_size * sizeof(size_type), 0, slice_num, - as_hip_type(max_nnz_per_slice.get_const_data()), - as_hip_type(block_results.get_data())); - - auto d_result = Array(exec, 1); - - hipLaunchKernelGGL(kernel::reduce_total_cols, dim3(1), - dim3(default_block_size), - default_block_size * sizeof(size_type), 0, grid_dim, - as_hip_type(block_results.get_const_data()), - as_hip_type(d_result.get_data())); - - *result = exec->copy_val_to_host(d_result.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Dense* orig, diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 5c90e03adfb..ad5d5132087 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -290,31 +290,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Ell* orig, - matrix::Diagonal* diag) -{ - const auto max_nnz_per_row = orig->get_num_stored_elements_per_row(); - const auto orig_stride = orig->get_stride(); - const auto diag_size = diag->get_size()[0]; - const auto num_blocks = - ceildiv(diag_size * max_nnz_per_row, default_block_size); - - const auto orig_values = orig->get_const_values(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - - hipLaunchKernelGGL(kernel::extract_diagonal, dim3(num_blocks), - dim3(default_block_size), 0, 0, diag_size, - max_nnz_per_row, orig_stride, as_hip_type(orig_values), - as_hip_type(orig_col_idxs), as_hip_type(diag_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); - - } // namespace ell } // namespace hip } // namespace kernels diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index a9700699199..2739a173fcc 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -111,7 +111,17 @@ template void convert_to_csr(const std::shared_ptr exec, const matrix::Fbcsr* const source, matrix::Csr* const result) - GKO_NOT_IMPLEMENTED; +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::convert_to_csr), num_blocks, default_block_size, + 0, 0, source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_hip_type(source->get_const_values()), result->get_row_ptrs(), + result->get_col_idxs(), as_hip_type(result->get_values()), + source->get_num_block_rows(), source->get_block_size()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); @@ -136,26 +146,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -template -void calculate_max_nnz_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* source, - Array* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void is_sorted_by_column_index( std::shared_ptr exec, diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp index 7883b4ce190..acd067ea28b 100644 --- a/hip/matrix/hybrid_kernels.hip.cpp +++ b/hip/matrix/hybrid_kernels.hip.cpp @@ -41,24 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include - - #include "common/unified/base/kernel_launch.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_kernels.hpp" -#include "core/matrix/ell_kernels.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/format_conversion.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/segment_scan.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { @@ -72,10 +56,6 @@ namespace hip { namespace hybrid { -constexpr int default_block_size = 512; -constexpr int warps_in_block = 4; - - #include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index d1a5ae9747d..08efddec844 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -114,109 +114,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Csr* result) -{ - const auto num_rows = source->get_size()[0]; - const auto slice_size = source->get_slice_size(); - const auto slice_num = ceildiv(num_rows, slice_size); - - const auto source_values = source->get_const_values(); - const auto source_slice_lengths = source->get_const_slice_lengths(); - const auto source_slice_sets = source->get_const_slice_sets(); - const auto source_col_idxs = source->get_const_col_idxs(); - - auto result_values = result->get_values(); - auto result_col_idxs = result->get_col_idxs(); - auto result_row_ptrs = result->get_row_ptrs(); - - auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - - if (grid_dim > 0) { - hipLaunchKernelGGL( - kernel::count_nnz_per_row, dim3(grid_dim), dim3(default_block_size), - 0, 0, num_rows, slice_size, as_hip_type(source_slice_sets), - as_hip_type(source_values), as_hip_type(result_row_ptrs)); - } - - components::prefix_sum(exec, result_row_ptrs, num_rows + 1); - - grid_dim = ceildiv(num_rows, default_block_size); - - if (grid_dim > 0) { - hipLaunchKernelGGL( - kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, - num_rows, slice_size, as_hip_type(source_slice_sets), - as_hip_type(source_col_idxs), as_hip_type(source_values), - as_hip_type(result_row_ptrs), as_hip_type(result_col_idxs), - as_hip_type(result_values)); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Sellp* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - - if (num_rows <= 0) { - *result = 0; - return; - } - - const auto slice_size = source->get_slice_size(); - const auto slice_sets = source->get_const_slice_sets(); - const auto values = source->get_const_values(); - - auto nnz_per_row = Array(exec, num_rows); - - auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - - hipLaunchKernelGGL(kernel::count_nnz_per_row, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, slice_size, - as_hip_type(slice_sets), as_hip_type(values), - as_hip_type(nnz_per_row.get_data())); - - *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Sellp* orig, - matrix::Diagonal* diag) -{ - const auto diag_size = diag->get_size()[0]; - const auto slice_size = orig->get_slice_size(); - const auto slice_num = ceildiv(diag_size, slice_size); - const auto num_blocks = - ceildiv(slice_num * config::warp_size, default_block_size); - - const auto orig_slice_sets = orig->get_const_slice_sets(); - const auto orig_values = orig->get_const_values(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - - hipLaunchKernelGGL(kernel::extract_diagonal, dim3(num_blocks), - dim3(default_block_size), 0, 0, diag_size, slice_size, - as_hip_type(orig_slice_sets), as_hip_type(orig_values), - as_hip_type(orig_col_idxs), as_hip_type(diag_values)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); - - } // namespace sellp } // namespace hip } // namespace kernels diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp index ab848338db3..f7e4d7643e2 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -69,6 +69,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::SparsityCsr* input, + matrix::Dense* output) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); + + template void count_num_diagonal_elements( std::shared_ptr exec, diff --git a/hip/test/matrix/coo_kernels.hip.cpp b/hip/test/matrix/coo_kernels.hip.cpp index 98647447784..3cceb98a9fc 100644 --- a/hip/test/matrix/coo_kernels.hip.cpp +++ b/hip/test/matrix/coo_kernels.hip.cpp @@ -337,7 +337,7 @@ TEST_F(Coo, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -352,7 +352,7 @@ TEST_F(Coo, ConvertToCsrIsEquivalentToRef) dense_mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp index 7a8966375eb..3e03cc5cd1e 100644 --- a/hip/test/matrix/csr_kernels.hip.cpp +++ b/hip/test/matrix/csr_kernels.hip.cpp @@ -530,7 +530,7 @@ TEST_F(Csr, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -543,7 +543,7 @@ TEST_F(Csr, MoveToDenseIsEquivalentToRef) mtx->move_to(dense_mtx.get()); dmtx->move_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -556,7 +556,7 @@ TEST_F(Csr, ConvertToEllIsEquivalentToRef) mtx->convert_to(ell_mtx.get()); dmtx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 0); } @@ -569,7 +569,7 @@ TEST_F(Csr, MoveToEllIsEquivalentToRef) mtx->move_to(ell_mtx.get()); dmtx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(ell_mtx.get(), dell_mtx.get(), 0); } @@ -582,7 +582,7 @@ TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef) mtx->convert_to(sparsity_mtx.get()); dmtx->convert_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 0); } @@ -595,22 +595,7 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) mtx->move_to(sparsity_mtx.get()); dmtx->move_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 1e-14); -} - - -TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef) -{ - set_up_apply_data(std::make_shared()); - gko::size_type max_nnz_per_row; - gko::size_type dmax_nnz_per_row; - - gko::kernels::reference::csr::calculate_max_nnz_per_row(ref, mtx.get(), - &max_nnz_per_row); - gko::kernels::hip::csr::calculate_max_nnz_per_row(hip, dmtx.get(), - &dmax_nnz_per_row); - - ASSERT_EQ(max_nnz_per_row, dmax_nnz_per_row); + GKO_ASSERT_MTX_NEAR(sparsity_mtx.get(), d_sparsity_mtx.get(), 0); } @@ -623,7 +608,7 @@ TEST_F(Csr, ConvertToCooIsEquivalentToRef) mtx->convert_to(coo_mtx.get()); dmtx->convert_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -636,7 +621,7 @@ TEST_F(Csr, MoveToCooIsEquivalentToRef) mtx->move_to(coo_mtx.get()); dmtx->move_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -649,7 +634,7 @@ TEST_F(Csr, ConvertToSellpIsEquivalentToRef) mtx->convert_to(sellp_mtx.get()); dmtx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 0); } @@ -662,7 +647,7 @@ TEST_F(Csr, MoveToSellpIsEquivalentToRef) mtx->move_to(sellp_mtx.get()); dmtx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx.get(), dsellp_mtx.get(), 0); } @@ -678,31 +663,16 @@ TEST_F(Csr, ConvertsEmptyToSellp) } -TEST_F(Csr, CalculateTotalColsIsEquivalentToRef) -{ - set_up_apply_data(std::make_shared()); - gko::size_type total_cols; - gko::size_type dtotal_cols; - - gko::kernels::reference::csr::calculate_total_cols( - ref, mtx.get(), &total_cols, 2, gko::matrix::default_slice_size); - gko::kernels::hip::csr::calculate_total_cols( - hip, dmtx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); - - ASSERT_EQ(total_cols, dtotal_cols); -} - - TEST_F(Csr, CalculatesNonzerosPerRow) { set_up_apply_data(std::make_shared()); gko::Array row_nnz(ref, mtx->get_size()[0]); gko::Array drow_nnz(hip, dmtx->get_size()[0]); - gko::kernels::reference::csr::calculate_nonzeros_per_row(ref, mtx.get(), - &row_nnz); - gko::kernels::hip::csr::calculate_nonzeros_per_row(hip, dmtx.get(), - &drow_nnz); + gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), + row_nnz.get_data()); + gko::kernels::hip::csr::count_nonzeros_per_row(hip, dmtx.get(), + drow_nnz.get_data()); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } @@ -720,7 +690,7 @@ TEST_F(Csr, ConvertToHybridIsEquivalentToRef) mtx->convert_to(hybrid_mtx.get()); dmtx->convert_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } @@ -736,7 +706,7 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef) mtx->move_to(hybrid_mtx.get()); dmtx->move_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp index 26c03a18ff4..e6978a68fa1 100644 --- a/hip/test/matrix/dense_kernels.hip.cpp +++ b/hip/test/matrix/dense_kernels.hip.cpp @@ -388,7 +388,7 @@ TEST_F(Dense, ConvertToCooIsEquivalentToRef) ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 0); } @@ -403,7 +403,7 @@ TEST_F(Dense, MoveToCooIsEquivalentToRef) ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 0); } @@ -416,7 +416,7 @@ TEST_F(Dense, ConvertToCsrIsEquivalentToRef) x->convert_to(csr_mtx.get()); dx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 0); } @@ -429,7 +429,7 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) x->move_to(csr_mtx.get()); dx->move_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 0); } @@ -442,7 +442,7 @@ TEST_F(Dense, ConvertToEllIsEquivalentToRef) x->convert_to(ell_mtx.get()); dx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 0); } @@ -455,7 +455,7 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef) x->move_to(ell_mtx.get()); dx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 0); } @@ -468,7 +468,7 @@ TEST_F(Dense, ConvertToSellpIsEquivalentToRef) x->convert_to(sellp_mtx.get()); dx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 0); } @@ -481,7 +481,7 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef) x->move_to(sellp_mtx.get()); dx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 0); } @@ -497,19 +497,6 @@ TEST_F(Dense, ConvertsEmptyToSellp) } -TEST_F(Dense, CountNNZIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); - gko::kernels::hip::dense::count_nonzeros(hip, dx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); -} - - TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); @@ -518,10 +505,10 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) gko::Array dnnz_per_row(hip); dnnz_per_row.resize_and_reset(dx->get_size()[0]); - gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), - &nnz_per_row); - gko::kernels::hip::dense::calculate_nonzeros_per_row(hip, dx.get(), - &dnnz_per_row); + gko::kernels::reference::dense::count_nonzeros_per_row( + ref, x.get(), nnz_per_row.get_data()); + gko::kernels::hip::dense::count_nonzeros_per_row(hip, dx.get(), + dnnz_per_row.get_data()); auto tmp = gko::Array(ref, dnnz_per_row); for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { @@ -530,36 +517,20 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) } -TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef) { set_up_apply_data(); gko::size_type max_nnz; gko::size_type dmax_nnz; - gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), - &max_nnz); - gko::kernels::hip::dense::calculate_max_nnz_per_row(hip, dx.get(), - &dmax_nnz); + gko::kernels::reference::dense::compute_max_nnz_per_row(ref, x.get(), + max_nnz); + gko::kernels::hip::dense::compute_max_nnz_per_row(hip, dx.get(), dmax_nnz); ASSERT_EQ(max_nnz, dmax_nnz); } -TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type total_cols; - gko::size_type dtotal_cols; - - gko::kernels::reference::dense::calculate_total_cols( - ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); - gko::kernels::hip::dense::calculate_total_cols( - hip, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); - - ASSERT_EQ(total_cols, dtotal_cols); -} - - TEST_F(Dense, IsTransposable) { set_up_apply_data(); diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp index 6a1debcee71..7a51207c43e 100644 --- a/hip/test/matrix/ell_kernels.hip.cpp +++ b/hip/test/matrix/ell_kernels.hip.cpp @@ -537,7 +537,7 @@ TEST_F(Ell, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -551,7 +551,7 @@ TEST_F(Ell, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/hip/test/matrix/hybrid_kernels.hip.cpp b/hip/test/matrix/hybrid_kernels.hip.cpp index e8a64514f06..58bb01e86eb 100644 --- a/hip/test/matrix/hybrid_kernels.hip.cpp +++ b/hip/test/matrix/hybrid_kernels.hip.cpp @@ -219,7 +219,7 @@ TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } @@ -232,7 +232,7 @@ TEST_F(Hybrid, MoveToCsrIsEquivalentToRef) mtx->move_to(csr_mtx.get()); dmtx->move_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/hip/test/matrix/sellp_kernels.hip.cpp b/hip/test/matrix/sellp_kernels.hip.cpp index 0677cda2d79..54f10e861f0 100644 --- a/hip/test/matrix/sellp_kernels.hip.cpp +++ b/hip/test/matrix/sellp_kernels.hip.cpp @@ -252,7 +252,7 @@ TEST_F(Sellp, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } @@ -266,7 +266,7 @@ TEST_F(Sellp, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } @@ -296,19 +296,6 @@ TEST_F(Sellp, ConvertEmptyToCsrIsEquivalentToRef) } -TEST_F(Sellp, CountNonzerosIsEquivalentToRef) -{ - set_up_apply_matrix(64); - gko::size_type nnz; - gko::size_type dnnz; - - gko::kernels::reference::sellp::count_nonzeros(ref, mtx.get(), &nnz); - gko::kernels::hip::sellp::count_nonzeros(hip, dmtx.get(), &dnnz); - - ASSERT_EQ(nnz, dnnz); -} - - TEST_F(Sellp, ExtractDiagonalIsEquivalentToRef) { set_up_apply_matrix(64); diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 8a03dba6942..8bda4e728c8 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -50,14 +50,15 @@ namespace matrix { template class Csr; - template class Dense; - template class CooBuilder; +template +class Hybrid; + /** * COO stores a matrix in the coordinate matrix format. @@ -90,6 +91,7 @@ class Coo : public EnableLinOp>, friend class Dense; friend class CooBuilder; friend class Coo, IndexType>; + friend class Hybrid; public: using EnableLinOp::convert_to; @@ -336,6 +338,8 @@ class Coo : public EnableLinOp>, GKO_ASSERT_EQ(values_.get_num_elems(), row_idxs_.get_num_elems()); } + void resize(dim<2> new_size, size_type nnz); + void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 160e6d9e0e8..40bd0ec418a 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -46,6 +46,9 @@ namespace matrix { template class Dense; +template +class Diagonal; + template class Coo; @@ -140,6 +143,7 @@ class Csr : public EnableLinOp>, friend class EnablePolymorphicObject; friend class Coo; friend class Dense; + friend class Diagonal; friend class Ell; friend class Hybrid; friend class Sellp; @@ -1033,8 +1037,6 @@ class Csr : public EnableLinOp>, this->make_srow(); } - void resize(gko::dim<2> new_size, size_type nnz); - void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 7b82fcb4e2f..c36e62b9a87 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -930,6 +930,24 @@ class Dense return Dense::create(exec, size, stride); } + template + void convert_impl(Coo* result) const; + + template + void convert_impl(Csr* result) const; + + template + void convert_impl(Ell* result) const; + + template + void convert_impl(Hybrid* result) const; + + template + void convert_impl(Sellp* result) const; + + template + void convert_impl(SparsityCsr* result) const; + /** * @copydoc scale(const LinOp *) * diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index c3f1d4674c5..5c82df72de1 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -45,9 +45,15 @@ namespace matrix { template class Dense; +template +class Coo; + template class Csr; +template +class Hybrid; + /** * ELL is a matrix format where stride with explicit zeros is used such that @@ -81,8 +87,11 @@ class Ell : public EnableLinOp>, friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Dense; + friend class Coo; friend class Csr; friend class Ell, IndexType>; + friend class Ell, IndexType>; + friend class Hybrid; public: using EnableLinOp::convert_to; @@ -95,8 +104,6 @@ class Ell : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Ell, IndexType>; - void convert_to( Ell, IndexType>* result) const override; @@ -343,6 +350,8 @@ class Ell : public EnableLinOp>, col_idxs_.get_num_elems()); } + void resize(dim<2> new_size, size_type max_row_nnz); + void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 55c5c9c9e68..f7fd81e8b08 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -736,6 +736,8 @@ class Hybrid strategy_(std::move(strategy)) {} + void resize(dim<2> new_size, size_type ell_row_nnz, size_type coo_nnz); + void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp index 0522c1e4dff..a2ff278b50b 100644 --- a/include/ginkgo/core/matrix/sparsity_csr.hpp +++ b/include/ginkgo/core/matrix/sparsity_csr.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include namespace gko { @@ -49,6 +50,10 @@ template class Csr; +template +class Dense; + + template class Fbcsr; @@ -75,12 +80,15 @@ template class SparsityCsr : public EnableLinOp>, public EnableCreateMethod>, + public ConvertibleTo>, + public ConvertibleTo>, public ReadableFromMatrixData, public WritableToMatrixData, public Transposable { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Csr; + friend class Dense; friend class Fbcsr; public: @@ -94,6 +102,14 @@ class SparsityCsr using mat_data = matrix_data; using device_mat_data = device_matrix_data; + void convert_to(Csr* result) const override; + + void move_to(Csr* result) override; + + void convert_to(Dense* result) const override; + + void move_to(Dense* result) override; + void read(const mat_data& data) override; void read(const device_mat_data& data) override; diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 0e2091cb8ee..6d31fae06e2 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -566,9 +566,6 @@ void fill_in_dense(std::shared_ptr exec, #pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - result->at(row, col) = zero(); - } for (size_type i = row_ptrs[row]; i < static_cast(row_ptrs[row + 1]); ++i) { result->at(row, col_idxs[i]) = vals[i]; @@ -580,26 +577,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_sellp(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Sellp* result) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); - - -template -void convert_to_ell(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Ell* result) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); - - template inline void convert_csr_to_csc(size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, @@ -671,25 +648,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result, size_type stride_factor, - size_type slice_size) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - template void calculate_nonzeros_per_row_in_span( std::shared_ptr exec, @@ -747,82 +705,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); -template -void convert_to_hybrid(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Hybrid* result) -{ - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - auto strategy = result->get_strategy(); - auto ell_lim = strategy->get_ell_num_stored_elements_per_row(); - auto coo_lim = strategy->get_coo_nnz(); - auto coo_val = result->get_coo_values(); - auto coo_col = result->get_coo_col_idxs(); - auto coo_row = result->get_coo_row_idxs(); - const auto max_nnz_per_row = result->get_ell_num_stored_elements_per_row(); - -// Initial Hybrid Matrix -#pragma omp parallel for - for (size_type i = 0; i < max_nnz_per_row; i++) { - for (size_type j = 0; j < result->get_ell_stride(); j++) { - result->ell_val_at(j, i) = zero(); - result->ell_col_at(j, i) = 0; - } - } - - const auto csr_row_ptrs = source->get_const_row_ptrs(); - const auto csr_vals = source->get_const_values(); - auto coo_offset = Array(exec, num_rows); - auto coo_offset_val = coo_offset.get_data(); - - coo_offset_val[0] = 0; -#pragma omp parallel for - for (size_type i = 1; i < num_rows; i++) { - auto temp = csr_row_ptrs[i] - csr_row_ptrs[i - 1]; - coo_offset_val[i] = (temp > max_nnz_per_row) * (temp - max_nnz_per_row); - } - - auto workspace = Array(exec, num_rows); - auto workspace_val = workspace.get_data(); - for (size_type i = 1; i < num_rows; i <<= 1) { -#pragma omp parallel for - for (size_type j = i; j < num_rows; j++) { - workspace_val[j] = coo_offset_val[j] + coo_offset_val[j - i]; - } -#pragma omp parallel for - for (size_type j = i; j < num_rows; j++) { - coo_offset_val[j] = workspace_val[j]; - } - } - -#pragma omp parallel for - for (IndexType row = 0; row < num_rows; row++) { - size_type ell_idx = 0; - size_type csr_idx = csr_row_ptrs[row]; - size_type coo_idx = coo_offset_val[row]; - while (csr_idx < csr_row_ptrs[row + 1]) { - const auto val = csr_vals[csr_idx]; - if (ell_idx < ell_lim) { - result->ell_val_at(row, ell_idx) = val; - result->ell_col_at(row, ell_idx) = - source->get_const_col_idxs()[csr_idx]; - ell_idx++; - } else { - coo_val[coo_idx] = val; - coo_col[coo_idx] = source->get_const_col_idxs()[csr_idx]; - coo_row[coo_idx] = row; - coo_idx++; - } - csr_idx++; - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); - - template void inv_symm_permute(std::shared_ptr exec, const IndexType* perm, @@ -939,24 +821,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Csr* source, - Array* result) -{ - const auto row_ptrs = source->get_const_row_ptrs(); - auto row_nnz_val = result->get_data(); - -#pragma omp parallel for - for (size_type i = 0; i < result->get_num_elems(); i++) { - row_nnz_val[i] = row_ptrs[i + 1] - row_ptrs[i]; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 086fa4afbf5..431d610b7ca 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -97,7 +97,7 @@ void apply(std::shared_ptr exec, const matrix::Dense* a, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* c) { - if (beta->at(0, 0) != zero()) { + if (is_nonzero(beta->at(0, 0))) { #pragma omp parallel for for (size_type row = 0; row < c->get_size()[0]; ++row) { for (size_type col = 0; col < c->get_size()[1]; ++col) { @@ -130,6 +130,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, + const int64* row_ptrs, matrix::Coo* result) { auto num_rows = result->get_size()[0]; @@ -139,27 +140,13 @@ void convert_to_coo(std::shared_ptr exec, auto row_idxs = result->get_row_idxs(); auto col_idxs = result->get_col_idxs(); auto values = result->get_values(); - Array row_ptrs_array(exec, num_rows); - auto row_ptrs = row_ptrs_array.get_data(); - -#pragma omp parallel for - for (size_type row = 0; row < num_rows; ++row) { - IndexType row_count{}; - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(row, col); - row_count += val != zero(); - } - row_ptrs[row] = row_count; - } - - components::prefix_sum(exec, row_ptrs, num_rows); #pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { auto idxs = row_ptrs[row]; for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { row_idxs[idxs] = row; col_idxs[idxs] = col; values[idxs] = val; @@ -186,24 +173,12 @@ void convert_to_csr(std::shared_ptr exec, auto col_idxs = result->get_col_idxs(); auto values = result->get_values(); -#pragma omp parallel for - for (size_type row = 0; row < num_rows; ++row) { - IndexType row_nnz{}; - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(row, col); - row_nnz += val != zero(); - } - row_ptrs[row] = row_nnz; - } - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - #pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { auto cur_ptr = row_ptrs[row]; for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { col_idxs[cur_ptr] = col; values[cur_ptr] = val; ++cur_ptr; @@ -236,7 +211,7 @@ void convert_to_ell(std::shared_ptr exec, size_type col_idx = 0; for (size_type col = 0; col < num_cols; col++) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { result->val_at(row, col_idx) = val; result->col_at(row, col_idx) = col; col_idx++; @@ -252,6 +227,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, + const int64* coo_row_ptrs, matrix::Hybrid* result) { auto num_rows = result->get_size()[0]; @@ -261,35 +237,6 @@ void convert_to_hybrid(std::shared_ptr exec, auto coo_val = result->get_coo_values(); auto coo_col = result->get_coo_col_idxs(); auto coo_row = result->get_coo_row_idxs(); - Array coo_row_ptrs_array(exec, num_rows); - auto coo_row_ptrs = coo_row_ptrs_array.get_data(); - - auto ell_nnz_row = result->get_ell_num_stored_elements_per_row(); - auto ell_stride = result->get_ell_stride(); -#pragma omp parallel for collapse(2) - for (size_type i = 0; i < ell_nnz_row; i++) { - for (size_type j = 0; j < ell_stride; j++) { - result->ell_val_at(j, i) = zero(); - result->ell_col_at(j, i) = 0; - } - } -#pragma omp parallel for - for (size_type i = 0; i < result->get_coo_num_stored_elements(); i++) { - coo_val[i] = zero(); - coo_col[i] = 0; - coo_row[i] = 0; - } -#pragma omp parallel for - for (size_type row = 0; row < num_rows; row++) { - size_type total_row_nnz{}; - for (size_type col = 0; col < num_cols; col++) { - auto val = source->at(row, col); - total_row_nnz += val != zero(); - } - coo_row_ptrs[row] = std::max(ell_lim, total_row_nnz) - ell_lim; - } - - components::prefix_sum(exec, coo_row_ptrs, num_rows); #pragma omp parallel for for (size_type row = 0; row < num_rows; row++) { @@ -297,16 +244,20 @@ void convert_to_hybrid(std::shared_ptr exec, size_type col = 0; for (; col < num_cols && ell_count < ell_lim; col++) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { result->ell_val_at(row, ell_count) = val; result->ell_col_at(row, ell_count) = col; ell_count++; } } + for (; ell_count < ell_lim; ell_count++) { + result->ell_val_at(row, ell_count) = zero(); + result->ell_col_at(row, ell_count) = 0; + } auto coo_idx = coo_row_ptrs[row]; for (; col < num_cols; col++) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { coo_val[coo_idx] = val; coo_col[coo_idx] = col; coo_row[coo_idx] = row; @@ -325,70 +276,37 @@ void convert_to_sellp(std::shared_ptr exec, const matrix::Dense* source, matrix::Sellp* result) { - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - auto vals = result->get_values(); - auto col_idxs = result->get_col_idxs(); - auto slice_lengths = result->get_slice_lengths(); - auto slice_sets = result->get_slice_sets(); - auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); - int slice_num = ceildiv(num_rows, slice_size); - slice_sets[0] = 0; - for (size_type slice = 0; slice < slice_num; slice++) { - if (slice > 0) { - slice_sets[slice] = - slice_sets[slice - 1] + slice_lengths[slice - 1]; - } - size_type current_slice_length = 0; -#pragma omp parallel for reduction(max : current_slice_length) - for (size_type row = 0; row < slice_size; row++) { - size_type global_row = slice * slice_size + row; - if (global_row < num_rows) { - size_type max_col = 0; - for (size_type col = 0; col < num_cols; col++) { - if (source->at(global_row, col) != zero()) { - max_col += 1; - } - } - current_slice_length = std::max(current_slice_length, max_col); - } - } - slice_lengths[slice] = - stride_factor * ceildiv(current_slice_length, stride_factor); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto vals = result->get_values(); + const auto col_idxs = result->get_col_idxs(); + const auto slice_sets = result->get_slice_sets(); + const auto slice_size = result->get_slice_size(); + const auto num_slices = ceildiv(num_rows, slice_size); #pragma omp parallel for - for (size_type row = 0; row < slice_size; row++) { - const size_type global_row = slice * slice_size + row; - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[slice] * slice_size + row; - for (size_type col = 0; col < num_cols; col++) { - auto val = source->at(global_row, col); - if (val != zero()) { - col_idxs[sellp_ind] = col; - vals[sellp_ind] = val; - sellp_ind += slice_size; - } - } - for (size_type i = sellp_ind; - i < - (slice_sets[slice] + slice_lengths[slice]) * slice_size + - row; - i += slice_size) { - col_idxs[i] = 0; - vals[i] = 0; + for (size_type slice = 0; slice < num_slices; slice++) { + for (size_type local_row = 0; local_row < slice_size; local_row++) { + const auto row = slice * slice_size + local_row; + if (row >= num_rows) { + break; + } + auto sellp_idx = slice_sets[slice] * slice_size + local_row; + const auto sellp_end = + slice_sets[slice + 1] * slice_size + local_row; + for (size_type col = 0; col < num_cols; col++) { + auto val = source->at(row, col); + if (is_nonzero(val)) { + col_idxs[sellp_idx] = col; + vals[sellp_idx] = val; + sellp_idx += slice_size; } } + for (; sellp_idx < sellp_end; sellp_idx += slice_size) { + col_idxs[sellp_idx] = 0; + vals[sellp_idx] = zero(); + } } } - - if (slice_num > 0) { - slice_sets[slice_num] = - slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; - } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -408,24 +326,12 @@ void convert_to_sparsity_csr(std::shared_ptr exec, auto value = result->get_value(); value[0] = one(); -#pragma omp parallel for - for (size_type row = 0; row < num_rows; ++row) { - IndexType row_nnz{}; - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(row, col); - row_nnz += val != zero(); - } - row_ptrs[row] = row_nnz; - } - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - #pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { auto cur_ptr = row_ptrs[row]; for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { col_idxs[cur_ptr] = col; ++cur_ptr; } @@ -437,105 +343,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Dense* source, size_type* result) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto num_nonzeros = 0; - -#pragma omp parallel for reduction(+ : num_nonzeros) - for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += (source->at(row, col) != zero()); - } - } - - *result = num_nonzeros; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - const auto num_cols = source->get_size()[1]; - size_type max_nonzeros_per_row = 0; -#pragma omp parallel for reduction(max : max_nonzeros_per_row) - for (size_type row = 0; row < num_rows; ++row) { - size_type num_nonzeros = 0; - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += (source->at(row, col) != zero()); - } - max_nonzeros_per_row = std::max(num_nonzeros, max_nonzeros_per_row); - } - *result = max_nonzeros_per_row; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* source, - Array* result) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto row_nnz_val = result->get_data(); -#pragma omp parallel for - for (size_type row = 0; row < num_rows; ++row) { - size_type num_nonzeros = 0; - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += (source->at(row, col) != zero()); - } - row_nnz_val[row] = num_nonzeros; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto slice_num = ceildiv(num_rows, slice_size); - size_type total_cols = 0; -#pragma omp parallel for reduction(+ : total_cols) - for (size_type slice = 0; slice < slice_num; slice++) { - size_type slice_temp = 0; - for (size_type row = 0; - row < slice_size && row + slice * slice_size < num_rows; row++) { - size_type temp = 0; - for (size_type col = 0; col < num_cols; col++) { - temp += (source->at(row + slice * slice_size, col) != - zero()); - } - slice_temp = (slice_temp < temp) ? temp : slice_temp; - } - slice_temp = ceildiv(slice_temp, stride_factor) * stride_factor; - total_cols += slice_temp; - } - - *result = total_cols; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Dense* orig, diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index 528c502fa10..6999a3eb880 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -258,32 +258,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Ell* orig, - matrix::Diagonal* diag) -{ - const auto col_idxs = orig->get_const_col_idxs(); - const auto values = orig->get_const_values(); - const auto diag_size = diag->get_size()[0]; - const auto max_nnz_per_row = orig->get_num_stored_elements_per_row(); - auto diag_values = diag->get_values(); - -#pragma omp parallel for - for (size_type row = 0; row < diag_size; row++) { - for (size_type i = 0; i < max_nnz_per_row; i++) { - if (orig->col_at(row, i) == row) { - diag_values[row] = orig->val_at(row, i); - break; - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_EXTRACT_DIAGONAL_KERNEL); - - } // namespace ell } // namespace omp } // namespace kernels diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index aa43f668c43..1cffc0334aa 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -316,53 +316,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -template -void calculate_max_nnz_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* const source, - size_type* const result) -{ - const auto num_rows = source->get_size()[0]; - const auto row_ptrs = source->get_const_row_ptrs(); - const int bs = source->get_block_size(); - IndexType max_nnz = 0; - -#pragma omp parallel for reduction(max : max_nnz) - for (size_type i = 0; i < num_rows; i++) { - const size_type ibrow = i / bs; - max_nnz = - std::max((row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs, max_nnz); - } - - *result = max_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row( - std::shared_ptr exec, - const matrix::Fbcsr* const source, - Array* const result) -{ - const auto row_ptrs = source->get_const_row_ptrs(); - auto row_nnz_val = result->get_data(); - const int bs = source->get_block_size(); - assert(result->get_num_elems() == source->get_size()[0]); - -#pragma omp parallel for - for (size_type i = 0; i < result->get_num_elems(); i++) { - const size_type ibrow = i / bs; - row_nnz_val[i] = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void is_sorted_by_column_index( std::shared_ptr exec, diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp index 53fff2c522a..3ff795c2d89 100644 --- a/omp/matrix/sellp_kernels.cpp +++ b/omp/matrix/sellp_kernels.cpp @@ -222,65 +222,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); -template -void convert_to_csr(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Csr* result) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::Sellp* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Sellp* orig, - matrix::Diagonal* diag) -{ - const auto diag_size = diag->get_size()[0]; - const auto slice_size = orig->get_slice_size(); - const auto slice_num = ceildiv(orig->get_size()[0], slice_size); - - const auto orig_values = orig->get_const_values(); - const auto orig_slice_sets = orig->get_const_slice_sets(); - const auto orig_slice_lengths = orig->get_const_slice_lengths(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - -#pragma omp parallel for - for (size_type slice = 0; slice < slice_num; slice++) { - for (size_type row = 0; row < slice_size; row++) { - auto global_row = slice_size * slice + row; - if (global_row >= diag_size) { - break; - } - for (size_type i = 0; i < orig_slice_lengths[slice]; i++) { - if (orig->col_at(row, orig_slice_sets[slice], i) == - global_row && - orig->val_at(row, orig_slice_sets[slice], i) != - zero()) { - diag_values[global_row] = - orig->val_at(row, orig_slice_sets[slice], i); - break; - } - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_EXTRACT_DIAGONAL_KERNEL); - - } // namespace sellp } // namespace omp } // namespace kernels diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp index 36a4520c1f8..172c4201760 100644 --- a/omp/matrix/sparsity_csr_kernels.cpp +++ b/omp/matrix/sparsity_csr_kernels.cpp @@ -123,6 +123,29 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::SparsityCsr* input, + matrix::Dense* output) +{ + auto row_ptrs = input->get_const_row_ptrs(); + auto col_idxs = input->get_const_col_idxs(); + auto val = input->get_const_value()[0]; + const auto num_rows = input->get_size()[0]; + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + for (auto k = row_ptrs[row]; k < row_ptrs[row + 1]; ++k) { + auto col = col_idxs[k]; + output->at(row, col) = val; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); + + template void count_num_diagonal_elements( std::shared_ptr exec, diff --git a/omp/test/matrix/coo_kernels.cpp b/omp/test/matrix/coo_kernels.cpp index b8ed8760037..998ea2c1ac7 100644 --- a/omp/test/matrix/coo_kernels.cpp +++ b/omp/test/matrix/coo_kernels.cpp @@ -325,7 +325,7 @@ TEST_F(Coo, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index 48cbcb9e53c..c82392c156b 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -410,7 +410,7 @@ TEST_F(Csr, ConvertToCooIsEquivalentToRef) mtx->convert_to(coo_mtx.get()); dmtx->convert_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -423,7 +423,7 @@ TEST_F(Csr, MoveToCooIsEquivalentToRef) mtx->move_to(coo_mtx.get()); dmtx->move_to(dcoo_mtx.get()); - GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(coo_mtx.get(), dcoo_mtx.get(), 0); } @@ -436,7 +436,7 @@ TEST_F(Csr, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(ddense_mtx.get(), dense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(ddense_mtx.get(), dense_mtx.get(), 0); } @@ -449,7 +449,7 @@ TEST_F(Csr, MoveToDenseIsEquivalentToRef) mtx->move_to(dense_mtx.get()); dmtx->move_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(ddense_mtx.get(), dense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(ddense_mtx.get(), dense_mtx.get(), 0); } @@ -462,7 +462,7 @@ TEST_F(Csr, ConvertToSparsityCsrIsEquivalentToRef) mtx->convert_to(sparsity_mtx.get()); dmtx->convert_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); } @@ -475,7 +475,7 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) mtx->move_to(sparsity_mtx.get()); dmtx->move_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); } @@ -485,10 +485,10 @@ TEST_F(Csr, CalculatesNonzerosPerRow) gko::Array row_nnz(ref, mtx->get_size()[0]); gko::Array drow_nnz(omp, dmtx->get_size()[0]); - gko::kernels::reference::csr::calculate_nonzeros_per_row(ref, mtx.get(), - &row_nnz); - gko::kernels::omp::csr::calculate_nonzeros_per_row(omp, dmtx.get(), - &drow_nnz); + gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), + row_nnz.get_data()); + gko::kernels::omp::csr::count_nonzeros_per_row(omp, dmtx.get(), + drow_nnz.get_data()); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } @@ -506,7 +506,7 @@ TEST_F(Csr, ConvertToHybridIsEquivalentToRef) mtx->convert_to(hybrid_mtx.get()); dmtx->convert_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } @@ -522,7 +522,7 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef) mtx->move_to(hybrid_mtx.get()); dmtx->move_to(dhybrid_mtx.get()); - GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(hybrid_mtx.get(), dhybrid_mtx.get(), 0); } diff --git a/omp/test/matrix/dense_kernels.cpp b/omp/test/matrix/dense_kernels.cpp index 3ed76d25e60..2e3164d5ef3 100644 --- a/omp/test/matrix/dense_kernels.cpp +++ b/omp/test/matrix/dense_kernels.cpp @@ -409,9 +409,9 @@ TEST_F(Dense, ConvertToCooIsEquivalentToRef) srmtx->convert_to(drmtx.get()); somtx->convert_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -429,9 +429,9 @@ TEST_F(Dense, MoveToCooIsEquivalentToRef) srmtx->move_to(drmtx.get()); somtx->move_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -449,9 +449,9 @@ TEST_F(Dense, ConvertToCsrIsEquivalentToRef) srmtx->convert_to(drmtx.get()); somtx->convert_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -469,9 +469,9 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) srmtx->move_to(drmtx.get()); somtx->move_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -485,7 +485,7 @@ TEST_F(Dense, ConvertToSparsityCsrIsEquivalentToRef) mtx->convert_to(sparsity_mtx.get()); dmtx->convert_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); } @@ -499,7 +499,7 @@ TEST_F(Dense, MoveToSparsityCsrIsEquivalentToRef) mtx->move_to(sparsity_mtx.get()); dmtx->move_to(d_sparsity_mtx.get()); - GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); } @@ -517,9 +517,9 @@ TEST_F(Dense, ConvertToEllIsEquivalentToRef) srmtx->convert_to(drmtx.get()); somtx->convert_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -537,9 +537,9 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef) srmtx->move_to(drmtx.get()); somtx->move_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -557,11 +557,9 @@ TEST_F(Dense, ConvertToHybridIsEquivalentToRef) srmtx->convert_to(drmtx.get()); somtx->convert_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - // Test between `srmtx` and `somtx` may fail due to the OpenMP - // implementation not sorting the Coo matrix part. - // Therefore, it is not performed. - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -579,11 +577,9 @@ TEST_F(Dense, MoveToHybridIsEquivalentToRef) srmtx->move_to(drmtx.get()); somtx->move_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - // Test between `srmtx` and `somtx` may fail due to the OpenMP - // implementation not sorting the Coo matrix part. - // Therefore, it is not performed. - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -601,9 +597,9 @@ TEST_F(Dense, ConvertToSellpIsEquivalentToRef) srmtx->convert_to(drmtx.get()); somtx->convert_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -621,9 +617,9 @@ TEST_F(Dense, MoveToSellpIsEquivalentToRef) srmtx->move_to(drmtx.get()); somtx->move_to(domtx.get()); - GKO_ASSERT_MTX_NEAR(drmtx, domtx, 1e-14); - GKO_ASSERT_MTX_NEAR(srmtx, somtx, 1e-14); - GKO_ASSERT_MTX_NEAR(domtx, omtx, 1e-14); + GKO_ASSERT_MTX_NEAR(drmtx, domtx, 0); + GKO_ASSERT_MTX_NEAR(srmtx, somtx, 0); + GKO_ASSERT_MTX_NEAR(domtx, omtx, 0); } @@ -647,47 +643,31 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) gko::Array dnnz_per_row(omp); dnnz_per_row.resize_and_reset(dx->get_size()[0]); - gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), - &nnz_per_row); - gko::kernels::omp::dense::calculate_nonzeros_per_row(omp, dx.get(), - &dnnz_per_row); + gko::kernels::reference::dense::count_nonzeros_per_row( + ref, x.get(), nnz_per_row.get_data()); + gko::kernels::omp::dense::count_nonzeros_per_row(omp, dx.get(), + dnnz_per_row.get_data()); GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row); } -TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef) { std::size_t ref_max_nnz_per_row = 0; std::size_t omp_max_nnz_per_row = 0; auto rmtx = gen_mtx(100, 100, 1); auto omtx = gko::clone(omp, rmtx); - gko::kernels::reference::dense::calculate_max_nnz_per_row( - ref, rmtx.get(), &ref_max_nnz_per_row); - gko::kernels::omp::dense::calculate_max_nnz_per_row(omp, omtx.get(), - &omp_max_nnz_per_row); + gko::kernels::reference::dense::compute_max_nnz_per_row( + ref, rmtx.get(), ref_max_nnz_per_row); + gko::kernels::omp::dense::compute_max_nnz_per_row(omp, omtx.get(), + omp_max_nnz_per_row); ASSERT_EQ(ref_max_nnz_per_row, omp_max_nnz_per_row); } -TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) -{ - std::size_t ref_total_cols = 0; - std::size_t omp_total_cols = 0; - auto rmtx = gen_mtx(100, 100, 1); - auto omtx = gko::clone(omp, rmtx); - - gko::kernels::reference::dense::calculate_total_cols( - ref, rmtx.get(), &ref_total_cols, 1, gko::matrix::default_slice_size); - gko::kernels::omp::dense::calculate_total_cols( - omp, omtx.get(), &omp_total_cols, 1, gko::matrix::default_slice_size); - - ASSERT_EQ(ref_total_cols, omp_total_cols); -} - - TEST_F(Dense, IsTransposable) { set_up_apply_data(); diff --git a/omp/test/matrix/fbcsr_kernels.cpp b/omp/test/matrix/fbcsr_kernels.cpp index 8164359397f..2e7aa6ec0e3 100644 --- a/omp/test/matrix/fbcsr_kernels.cpp +++ b/omp/test/matrix/fbcsr_kernels.cpp @@ -286,21 +286,6 @@ TEST_F(Fbcsr, ConjugateTransposeIsEquivalentToRef) } -TEST_F(Fbcsr, CalculatesNonzerosPerRow) -{ - set_up_apply_data(); - gko::Array row_nnz(ref, mtx->get_size()[0]); - gko::Array drow_nnz(omp, dmtx->get_size()[0]); - - gko::kernels::reference::fbcsr::calculate_nonzeros_per_row(ref, mtx.get(), - &row_nnz); - gko::kernels::omp::fbcsr::calculate_nonzeros_per_row(omp, dmtx.get(), - &drow_nnz); - - GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); -} - - TEST_F(Fbcsr, RecognizeSortedMatrix) { set_up_apply_data(); @@ -402,21 +387,4 @@ TEST_F(Fbcsr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef) } -TEST_F(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3) -{ - auto mtx_ref = gko::test::generate_random_fbcsr( - ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); - auto rand_omp = Mtx::create(omp); - rand_omp->copy_from(gko::lend(mtx_ref)); - gko::size_type ref_max_nnz{}, omp_max_nnz{}; - - gko::kernels::omp::fbcsr::calculate_max_nnz_per_row( - this->omp, rand_omp.get(), &omp_max_nnz); - gko::kernels::reference::fbcsr::calculate_max_nnz_per_row( - this->ref, mtx_ref.get(), &ref_max_nnz); - - ASSERT_EQ(ref_max_nnz, omp_max_nnz); -} - - } // namespace diff --git a/omp/test/matrix/hybrid_kernels.cpp b/omp/test/matrix/hybrid_kernels.cpp index 9ba71b36738..c9e4e6a0c87 100644 --- a/omp/test/matrix/hybrid_kernels.cpp +++ b/omp/test/matrix/hybrid_kernels.cpp @@ -267,7 +267,7 @@ TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) mtx->convert_to(csr_mtx.get()); dmtx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0); } diff --git a/omp/test/matrix/sellp_kernels.cpp b/omp/test/matrix/sellp_kernels.cpp index e7bc9536191..5aa3cdce700 100644 --- a/omp/test/matrix/sellp_kernels.cpp +++ b/omp/test/matrix/sellp_kernels.cpp @@ -251,7 +251,7 @@ TEST_F(Sellp, ConvertToDenseIsEquivalentToRef) mtx->convert_to(dense_mtx.get()); dmtx->convert_to(ddense_mtx.get()); - GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0); } diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 991c914f336..3b72aa336af 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -408,39 +408,15 @@ void convert_to_sellp(std::shared_ptr exec, auto col_idxs = result->get_col_idxs(); auto slice_lengths = result->get_slice_lengths(); auto slice_sets = result->get_slice_sets(); - auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); + auto slice_size = result->get_slice_size(); + auto stride_factor = result->get_stride_factor(); const auto source_row_ptrs = source->get_const_row_ptrs(); const auto source_col_idxs = source->get_const_col_idxs(); const auto source_values = source->get_const_values(); auto slice_num = ceildiv(num_rows, slice_size); - slice_sets[0] = 0; for (size_type slice = 0; slice < slice_num; slice++) { - if (slice > 0) { - slice_sets[slice] = - slice_sets[slice - 1] + slice_lengths[slice - 1]; - } - slice_lengths[slice] = 0; - for (size_type row = 0; row < slice_size; row++) { - size_type global_row = slice * slice_size + row; - if (global_row >= num_rows) { - break; - } - slice_lengths[slice] = - (slice_lengths[slice] > - source_row_ptrs[global_row + 1] - source_row_ptrs[global_row]) - ? slice_lengths[slice] - : source_row_ptrs[global_row + 1] - - source_row_ptrs[global_row]; - } - slice_lengths[slice] = - stride_factor * ceildiv(slice_lengths[slice], stride_factor); for (size_type row = 0; row < slice_size; row++) { size_type global_row = slice * slice_size + row; if (global_row >= num_rows) { @@ -462,48 +438,12 @@ void convert_to_sellp(std::shared_ptr exec, } } } - if (slice_num > 0) { - slice_sets[slice_num] = - slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; - } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - size_type total_cols = 0; - const auto num_rows = source->get_size()[0]; - const auto slice_num = ceildiv(num_rows, slice_size); - - const auto row_ptrs = source->get_const_row_ptrs(); - - for (size_type slice = 0; slice < slice_num; slice++) { - IndexType max_nnz_per_row_in_this_slice = 0; - for (size_type row = 0; - row < slice_size && row + slice * slice_size < num_rows; row++) { - size_type global_row = slice * slice_size + row; - max_nnz_per_row_in_this_slice = - max(row_ptrs[global_row + 1] - row_ptrs[global_row], - max_nnz_per_row_in_this_slice); - } - total_cols += ceildiv(max_nnz_per_row_in_this_slice, stride_factor) * - stride_factor; - } - - *result = total_cols; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL); - - template void convert_to_ell(std::shared_ptr exec, const matrix::Csr* source, @@ -606,26 +546,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result) -{ - const auto num_rows = source->get_size()[0]; - const auto row_ptrs = source->get_const_row_ptrs(); - IndexType max_nnz = 0; - - for (size_type i = 0; i < num_rows; i++) { - max_nnz = std::max(row_ptrs[i + 1] - row_ptrs[i], max_nnz); - } - - *result = max_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - template void calculate_nonzeros_per_row_in_span( std::shared_ptr exec, @@ -687,6 +607,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, const matrix::Csr* source, + const int64*, matrix::Hybrid* result) { auto num_rows = result->get_size()[0]; @@ -891,19 +812,18 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Csr* source, - Array* result) +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Csr* source, + size_type* result) { const auto row_ptrs = source->get_const_row_ptrs(); - auto row_nnz_val = result->get_data(); - for (size_type i = 0; i < result->get_num_elems(); i++) { - row_nnz_val[i] = row_ptrs[i + 1] - row_ptrs[i]; + for (size_type i = 0; i < source->get_size()[0]; i++) { + result[i] = row_ptrs[i + 1] - row_ptrs[i]; } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); + GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL); template diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 99cf6ad390d..804237e4748 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -48,6 +48,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/prefix_sum_kernels.hpp" + + namespace gko { namespace kernels { namespace reference { @@ -89,7 +92,7 @@ void apply(std::shared_ptr exec, const matrix::Dense* a, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* c) { - if (beta->at(0, 0) != zero()) { + if (is_nonzero(beta->at(0, 0))) { for (size_type row = 0; row < c->get_size()[0]; ++row) { for (size_type col = 0; col < c->get_size()[1]; ++col) { c->at(row, col) *= beta->at(0, 0); @@ -366,7 +369,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_coo(std::shared_ptr exec, - const matrix::Dense* source, + const matrix::Dense* source, const int64*, matrix::Coo* result) { auto num_rows = result->get_size()[0]; @@ -377,11 +380,11 @@ void convert_to_coo(std::shared_ptr exec, auto col_idxs = result->get_col_idxs(); auto values = result->get_values(); - auto idxs = 0; + size_type idxs = 0; for (size_type row = 0; row < num_rows; ++row) { for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { row_idxs[idxs] = row; col_idxs[idxs] = col; values[idxs] = val; @@ -413,7 +416,7 @@ void convert_to_csr(std::shared_ptr exec, for (size_type row = 0; row < num_rows; ++row) { for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { col_idxs[cur_ptr] = col; values[cur_ptr] = val; ++cur_ptr; @@ -446,7 +449,7 @@ void convert_to_ell(std::shared_ptr exec, col_idx = 0; for (size_type col = 0; col < num_cols; col++) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { result->val_at(row, col_idx) = val; result->col_at(row, col_idx) = col; col_idx++; @@ -461,7 +464,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_hybrid(std::shared_ptr exec, - const matrix::Dense* source, + const matrix::Dense* source, const int64*, matrix::Hybrid* result) { auto num_rows = result->get_size()[0]; @@ -490,7 +493,7 @@ void convert_to_hybrid(std::shared_ptr exec, size_type col_idx = 0, col = 0; while (col < num_cols && col_idx < ell_lim) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { result->ell_val_at(row, col_idx) = val; result->ell_col_at(row, col_idx) = col; col_idx++; @@ -499,7 +502,7 @@ void convert_to_hybrid(std::shared_ptr exec, } while (col < num_cols) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { coo_val[coo_idx] = val; coo_col[coo_idx] = col; coo_row[coo_idx] = row; @@ -525,62 +528,24 @@ void convert_to_sellp(std::shared_ptr exec, auto col_idxs = result->get_col_idxs(); auto slice_lengths = result->get_slice_lengths(); auto slice_sets = result->get_slice_sets(); - auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); - auto slice_num = ceildiv(num_rows, slice_size); - slice_sets[0] = 0; - for (size_type slice = 0; slice < slice_num; slice++) { - if (slice > 0) { - slice_sets[slice] = - slice_sets[slice - 1] + slice_lengths[slice - 1]; - } - slice_lengths[slice] = 0; - for (size_type row = 0; row < slice_size; row++) { - size_type global_row = slice * slice_size + row; - if (global_row >= num_rows) { - break; - } - size_type max_col = 0; - for (size_type col = 0; col < num_cols; col++) { - if (source->at(global_row, col) != zero()) { - max_col += 1; - } - } - slice_lengths[slice] = std::max(slice_lengths[slice], max_col); - } - slice_lengths[slice] = - stride_factor * ceildiv(slice_lengths[slice], stride_factor); - for (size_type row = 0; row < slice_size; row++) { - size_type global_row = slice * slice_size + row; - if (global_row >= num_rows) { - break; - } - size_type sellp_ind = slice_sets[slice] * slice_size + row; - for (size_type col = 0; col < num_cols; col++) { - auto val = source->at(global_row, col); - if (val != zero()) { - col_idxs[sellp_ind] = col; - vals[sellp_ind] = val; - sellp_ind += slice_size; - } - } - for (size_type i = sellp_ind; - i < - (slice_sets[slice] + slice_lengths[slice]) * slice_size + row; - i += slice_size) { - col_idxs[i] = 0; - vals[i] = 0; + auto slice_size = result->get_slice_size(); + for (size_type row = 0; row < num_rows; row++) { + const auto slice = row / slice_size; + const auto local_row = row % slice_size; + auto sellp_ind = slice_sets[slice] * slice_size + local_row; + const auto sellp_end = slice_sets[slice + 1] * slice_size + local_row; + for (size_type col = 0; col < num_cols; col++) { + auto val = source->at(row, col); + if (is_nonzero(val)) { + col_idxs[sellp_ind] = col; + vals[sellp_ind] = val; + sellp_ind += slice_size; } } - } - - if (slice_num > 0) { - slice_sets[slice_num] = - slice_sets[slice_num - 1] + slice_lengths[slice_num - 1]; + for (; sellp_ind < sellp_end; sellp_ind += slice_size) { + col_idxs[sellp_ind] = 0; + vals[sellp_ind] = zero(); + } } } @@ -605,7 +570,7 @@ void convert_to_sparsity_csr(std::shared_ptr exec, for (size_type row = 0; row < num_rows; ++row) { for (size_type col = 0; col < num_cols; ++col) { auto val = source->at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { col_idxs[cur_ptr] = col; ++cur_ptr; } @@ -619,102 +584,78 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void count_nonzeros(std::shared_ptr exec, - const matrix::Dense* source, size_type* result) +void compute_max_nnz_per_row(std::shared_ptr exec, + const matrix::Dense* source, + size_type& result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; - auto num_nonzeros = 0; - + result = 0; for (size_type row = 0; row < num_rows; ++row) { + size_type num_nonzeros = 0; for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += (source->at(row, col) != zero()); + num_nonzeros += is_nonzero(source->at(row, col)); } + result = std::max(num_nonzeros, result); } - - *result = num_nonzeros; } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - size_type num_stored_elements_per_row = 0; - size_type num_nonzeros = 0; - for (size_type row = 0; row < num_rows; ++row) { - num_nonzeros = 0; - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += (source->at(row, col) != zero()); +void compute_slice_sets(std::shared_ptr exec, + const matrix::Dense* source, + size_type slice_size, size_type stride_factor, + size_type* slice_sets, size_type* slice_lengths) +{ + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto num_slices = ceildiv(num_rows, slice_size); + for (size_type slice = 0; slice < num_slices; slice++) { + size_type slice_length = 0; + for (size_type local_row = 0; local_row < slice_size; local_row++) { + const auto row = slice * slice_size + local_row; + size_type row_nnz{}; + if (row < num_rows) { + for (size_type col = 0; col < num_cols; col++) { + row_nnz += is_nonzero(source->at(row, col)); + } + } + slice_length = std::max( + slice_length, ceildiv(row_nnz, stride_factor) * stride_factor); } - num_stored_elements_per_row = - std::max(num_nonzeros, num_stored_elements_per_row); + slice_lengths[slice] = slice_length; } - - *result = num_stored_elements_per_row; + exec->copy(num_slices, slice_lengths, slice_sets); + components::prefix_sum(exec, slice_sets, num_slices + 1); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* source, - Array* result) +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Dense* source, + IndexType* result) { auto num_rows = source->get_size()[0]; auto num_cols = source->get_size()[1]; - auto row_nnz_val = result->get_data(); for (size_type row = 0; row < num_rows; ++row) { - size_type num_nonzeros = 0; + IndexType num_nonzeros{}; for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += (source->at(row, col) != zero()); - } - row_nnz_val[row] = num_nonzeros; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::Dense* source, - size_type* result, size_type stride_factor, - size_type slice_size) -{ - auto num_rows = source->get_size()[0]; - auto num_cols = source->get_size()[1]; - auto slice_num = ceildiv(num_rows, slice_size); - auto total_cols = 0; - auto temp = 0, slice_temp = 0; - for (size_type slice = 0; slice < slice_num; slice++) { - slice_temp = 0; - for (size_type row = 0; - row < slice_size && row + slice * slice_size < num_rows; row++) { - temp = 0; - for (size_type col = 0; col < num_cols; col++) { - temp += (source->at(row + slice * slice_size, col) != - zero()); - } - slice_temp = (slice_temp < temp) ? temp : slice_temp; + num_nonzeros += is_nonzero(source->at(row, col)); } - slice_temp = ceildiv(slice_temp, stride_factor) * stride_factor; - total_cols += slice_temp; + result[row] = num_nonzeros; } - - *result = total_cols; } +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); template diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 0c9c014a812..f3638a52217 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -231,7 +231,7 @@ void convert_to_csr(std::shared_ptr exec, for (size_type i = 0; i < max_nnz_per_row; i++) { const auto val = source->val_at(row, i); const auto col = source->col_at(row, i); - if (val != zero()) { + if (is_nonzero(val)) { values[cur_ptr] = val; col_idxs[cur_ptr] = col; cur_ptr++; @@ -257,8 +257,7 @@ void count_nonzeros_per_row(std::shared_ptr exec, for (size_type row = 0; row < num_rows; row++) { size_type nonzeros_in_this_row = 0; for (size_type i = 0; i < max_nnz_per_row; i++) { - nonzeros_in_this_row += - (source->val_at(row, i) != zero()); + nonzeros_in_this_row += is_nonzero(source->val_at(row, i)); } result[row] = nonzeros_in_this_row; } diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index e630c8c28a3..f0eca3355d5 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -397,50 +397,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -template -void calculate_max_nnz_per_row( - std::shared_ptr, - const matrix::Fbcsr* const source, - size_type* const result) -{ - const auto num_rows = source->get_size()[0]; - const auto row_ptrs = source->get_const_row_ptrs(); - const int bs = source->get_block_size(); - IndexType max_nnz = 0; - - for (size_type i = 0; i < num_rows; i++) { - const size_type ibrow = i / bs; - max_nnz = - std::max((row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs, max_nnz); - } - - *result = max_nnz; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row( - std::shared_ptr, - const matrix::Fbcsr* source, Array* result) -{ - const auto row_ptrs = source->get_const_row_ptrs(); - auto row_nnz_val = result->get_data(); - const int bs = source->get_block_size(); - assert(result->get_num_elems() == source->get_size()[0]); - - for (size_type i = 0; i < result->get_num_elems(); i++) { - const size_type ibrow = i / bs; - row_nnz_val[i] = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - template void is_sorted_by_column_index( std::shared_ptr, diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index d901235a8e1..3b2f9badc54 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/ell_kernels.hpp" @@ -56,6 +57,21 @@ namespace reference { namespace hybrid { +void compute_coo_row_ptrs(std::shared_ptr exec, + const Array& row_nnz, size_type ell_lim, + int64* coo_row_ptrs) +{ + for (size_type row = 0; row < row_nnz.get_num_elems(); row++) { + if (row_nnz.get_const_data()[row] <= ell_lim) { + coo_row_ptrs[row] = 0; + } else { + coo_row_ptrs[row] = row_nnz.get_const_data()[row] - ell_lim; + } + } + components::prefix_sum(exec, coo_row_ptrs, row_nnz.get_num_elems() + 1); +} + + void compute_row_nnz(std::shared_ptr exec, const Array& row_ptrs, size_type* row_nnzs) { @@ -125,7 +141,7 @@ void convert_to_csr(std::shared_ptr exec, // Ell part for (IndexType col = 0; col < max_nnz_per_row; col++) { const auto val = ell->val_at(row, col); - if (val != zero()) { + if (is_nonzero(val)) { csr_val[csr_idx] = val; csr_col_idxs[csr_idx] = ell->col_at(row, col); csr_idx++; diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp index 443f39f56c2..c124189fabc 100644 --- a/reference/matrix/sellp_kernels.cpp +++ b/reference/matrix/sellp_kernels.cpp @@ -126,8 +126,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); +template void compute_slice_sets(std::shared_ptr exec, - const Array& row_ptrs, size_type slice_size, + const Array& row_ptrs, size_type slice_size, size_type stride_factor, size_type* slice_sets, size_type* slice_lengths) { @@ -140,7 +141,7 @@ void compute_slice_sets(std::shared_ptr exec, const auto row = slice * slice_size + local_row; const auto row_length = row < num_rows ? row_ptrs_ptr[row + 1] - row_ptrs_ptr[row] - : int64{}; + : IndexType{}; slice_length = std::max( slice_length, ceildiv(row_length, stride_factor) * stride_factor); @@ -151,6 +152,9 @@ void compute_slice_sets(std::shared_ptr exec, components::prefix_sum(exec, slice_sets, num_slices + 1); } +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_SELLP_COMPUTE_SLICE_SETS_KERNEL); + template void fill_in_matrix_data( @@ -224,9 +228,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_csr(std::shared_ptr exec, - const matrix::Sellp* source, - matrix::Csr* result) +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Sellp* source, + IndexType* result) { auto num_rows = source->get_size()[0]; auto slice_size = source->get_slice_size(); @@ -237,70 +241,72 @@ void convert_to_csr(std::shared_ptr exec, const auto source_slice_sets = source->get_const_slice_sets(); const auto source_col_idxs = source->get_const_col_idxs(); - auto result_vals = result->get_values(); - auto result_row_ptrs = result->get_row_ptrs(); - auto result_col_idxs = result->get_col_idxs(); - - size_type cur_ptr = 0; - for (size_type slice = 0; slice < slice_num; slice++) { for (size_type row = 0; row < slice_size; row++) { auto global_row = slice * slice_size + row; if (global_row >= num_rows) { break; } - result_row_ptrs[global_row] = cur_ptr; + IndexType row_nnz{}; for (size_type sellp_ind = source_slice_sets[slice] * slice_size + row; sellp_ind < source_slice_sets[slice + 1] * slice_size + row; sellp_ind += slice_size) { - if (source_vals[sellp_ind] != zero()) { - result_vals[cur_ptr] = source_vals[sellp_ind]; - result_col_idxs[cur_ptr] = source_col_idxs[sellp_ind]; - cur_ptr++; - } + row_nnz += is_nonzero(source_vals[sellp_ind]); } + result[global_row] = row_nnz; } } - result_row_ptrs[num_rows] = cur_ptr; } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); + GKO_DECLARE_SELLP_COUNT_NONZEROS_PER_ROW_KERNEL); template -void count_nonzeros(std::shared_ptr exec, +void convert_to_csr(std::shared_ptr exec, const matrix::Sellp* source, - size_type* result) + matrix::Csr* result) { auto num_rows = source->get_size()[0]; auto slice_size = source->get_slice_size(); auto slice_num = ceildiv(num_rows, slice_size); - const auto vals = source->get_const_values(); - const auto slice_sets = source->get_const_slice_sets(); + const auto source_vals = source->get_const_values(); + const auto source_slice_lengths = source->get_const_slice_lengths(); + const auto source_slice_sets = source->get_const_slice_sets(); + const auto source_col_idxs = source->get_const_col_idxs(); + + auto result_vals = result->get_values(); + auto result_row_ptrs = result->get_row_ptrs(); + auto result_col_idxs = result->get_col_idxs(); - auto num_nonzeros = 0; + size_type cur_ptr = 0; for (size_type slice = 0; slice < slice_num; slice++) { - for (size_type row = 0; - row < slice_size && slice_size * slice + row < num_rows; row++) { - for (size_type sellp_ind = slice_sets[slice] * slice_size + row; - sellp_ind < slice_sets[slice + 1] * slice_size + row; + for (size_type row = 0; row < slice_size; row++) { + auto global_row = slice * slice_size + row; + if (global_row >= num_rows) { + break; + } + result_row_ptrs[global_row] = cur_ptr; + for (size_type sellp_ind = + source_slice_sets[slice] * slice_size + row; + sellp_ind < source_slice_sets[slice + 1] * slice_size + row; sellp_ind += slice_size) { - if (vals[sellp_ind] != zero()) { - num_nonzeros++; + if (is_nonzero(source_vals[sellp_ind])) { + result_vals[cur_ptr] = source_vals[sellp_ind]; + result_col_idxs[cur_ptr] = source_col_idxs[sellp_ind]; + cur_ptr++; } } } } - - *result = num_nonzeros; + result_row_ptrs[num_rows] = cur_ptr; } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL); + GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL); template diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp index afe7a2cdc79..ec70f05181b 100644 --- a/reference/matrix/sparsity_csr_kernels.cpp +++ b/reference/matrix/sparsity_csr_kernels.cpp @@ -133,6 +133,27 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_FILL_IN_MATRIX_DATA_KERNEL); +template +void fill_in_dense(std::shared_ptr exec, + const matrix::SparsityCsr* input, + matrix::Dense* output) +{ + auto row_ptrs = input->get_const_row_ptrs(); + auto col_idxs = input->get_const_col_idxs(); + auto val = input->get_const_value()[0]; + + for (size_type row = 0; row < input->get_size()[0]; ++row) { + for (auto k = row_ptrs[row]; k < row_ptrs[row + 1]; ++k) { + auto col = col_idxs[k]; + output->at(row, col) = val; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL); + + template void count_num_diagonal_elements( std::shared_ptr exec, diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 3458ba3693c..b1161cff463 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -1013,8 +1013,8 @@ TYPED_TEST(Csr, CalculatesNonzerosPerRow) { gko::Array row_nnz(this->exec, this->mtx->get_size()[0]); - gko::kernels::reference::csr::calculate_nonzeros_per_row( - this->exec, this->mtx.get(), &row_nnz); + gko::kernels::reference::csr::count_nonzeros_per_row( + this->exec, this->mtx.get(), row_nnz.get_data()); auto row_nnz_val = row_nnz.get_data(); ASSERT_EQ(row_nnz_val[0], 3); @@ -1022,19 +1022,6 @@ TYPED_TEST(Csr, CalculatesNonzerosPerRow) } -TYPED_TEST(Csr, CalculatesTotalCols) -{ - gko::size_type total_cols; - gko::size_type stride_factor = gko::matrix::default_stride_factor; - gko::size_type slice_size = gko::matrix::default_slice_size; - - gko::kernels::reference::csr::calculate_total_cols( - this->exec, this->mtx.get(), &total_cols, stride_factor, slice_size); - - ASSERT_EQ(total_cols, 3); -} - - TYPED_TEST(Csr, ConvertsToEll) { using Ell = typename TestFixture::Ell; diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 41e77f8afd5..e7dcee0f608 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -1197,7 +1197,8 @@ TYPED_TEST(Dense, ConvertsToEllWithStride) { using T = typename TestFixture::value_type; using Ell = typename gko::matrix::Ell; - auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{}, 0, 3); + auto ell_mtx = + Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); this->mtx6->convert_to(ell_mtx.get()); auto v = ell_mtx->get_const_values(); @@ -1226,7 +1227,8 @@ TYPED_TEST(Dense, MovesToEllWithStride) { using T = typename TestFixture::value_type; using Ell = typename gko::matrix::Ell; - auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{}, 0, 3); + auto ell_mtx = + Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); this->mtx6->move_to(ell_mtx.get()); auto v = ell_mtx->get_const_values(); @@ -1388,7 +1390,7 @@ TYPED_TEST(Dense, MovesToHybridWithStrideAutomatically) using T = typename TestFixture::value_type; using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3); + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); this->mtx4->move_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_coo_values(); @@ -1422,7 +1424,7 @@ TYPED_TEST(Dense, ConvertsToHybridWithStrideAutomatically) using T = typename TestFixture::value_type; using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3); + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); this->mtx4->convert_to(hybrid_mtx.get()); auto v = hybrid_mtx->get_const_coo_values(); @@ -1456,7 +1458,7 @@ TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) using T = typename TestFixture::value_type; using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3, + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, std::make_shared(2)); this->mtx4->move_to(hybrid_mtx.get()); @@ -1467,7 +1469,7 @@ TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 3); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1); EXPECT_EQ(n, 2); EXPECT_EQ(p, 3); EXPECT_EQ(c[0], 0); @@ -1483,14 +1485,8 @@ TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) EXPECT_EQ(v[4], T{0.0}); EXPECT_EQ(v[5], T{0.0}); EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], T{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], T{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[1], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[2], 0); EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[1], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[2], 0); + EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); } @@ -1499,7 +1495,7 @@ TYPED_TEST(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) using T = typename TestFixture::value_type; using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, 3, + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, std::make_shared(2)); this->mtx4->convert_to(hybrid_mtx.get()); @@ -1510,7 +1506,7 @@ TYPED_TEST(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 3); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1); EXPECT_EQ(n, 2); EXPECT_EQ(p, 3); EXPECT_EQ(c[0], 0); @@ -1525,15 +1521,9 @@ TYPED_TEST(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) EXPECT_EQ(v[3], T{3.0}); EXPECT_EQ(v[4], T{0.0}); EXPECT_EQ(v[5], T{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[1], T{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[2], T{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[1], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[2], 0); EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[1], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[2], 0); + EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0}); } @@ -1542,7 +1532,7 @@ TYPED_TEST(Dense, MovesToHybridWithStrideByPercent40) using T = typename TestFixture::value_type; using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, std::make_shared(0.4)); this->mtx4->move_to(hybrid_mtx.get()); @@ -1579,7 +1569,7 @@ TYPED_TEST(Dense, ConvertsToHybridWithStrideByPercent40) using T = typename TestFixture::value_type; using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{}, 0, 3, + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, std::make_shared(0.4)); this->mtx4->convert_to(hybrid_mtx.get()); diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index d2696b53d95..6db215bc364 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -517,39 +517,6 @@ TYPED_TEST(Fbcsr, MovesEmptyToSparsityCsr) } -TYPED_TEST(Fbcsr, CalculatesNonzerosPerRow) -{ - using IndexType = typename TestFixture::index_type; - gko::Array row_nnz(this->exec, this->mtx2->get_size()[0]); - - gko::kernels::reference::fbcsr::calculate_nonzeros_per_row( - this->exec, this->mtx2.get(), &row_nnz); - gko::Array refrnnz(this->exec, this->mtx2->get_size()[0]); - gko::kernels::reference::csr ::calculate_nonzeros_per_row( - this->exec, this->ref2csrmtx.get(), &refrnnz); - - ASSERT_EQ(row_nnz.get_num_elems(), refrnnz.get_num_elems()); - auto row_nnz_val = row_nnz.get_data(); - for (gko::size_type i = 0; i < this->mtx2->get_size()[0]; i++) - ASSERT_EQ(row_nnz_val[i], refrnnz.get_const_data()[i]); -} - - -TYPED_TEST(Fbcsr, CalculatesMaxNnzPerRow) -{ - using IndexType = typename TestFixture::index_type; - gko::size_type max_row_nnz{}; - - gko::kernels::reference::fbcsr::calculate_max_nnz_per_row( - this->exec, this->mtx2.get(), &max_row_nnz); - gko::size_type ref_max_row_nnz{}; - gko::kernels::reference::csr::calculate_max_nnz_per_row( - this->exec, this->ref2csrmtx.get(), &ref_max_row_nnz); - - ASSERT_EQ(max_row_nnz, ref_max_row_nnz); -} - - TYPED_TEST(Fbcsr, SquareMtxIsTransposable) { using Fbcsr = typename TestFixture::Mtx; diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index de032a39830..a4f1bb1c80c 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -606,17 +606,6 @@ TYPED_TEST(Sellp, MovesWithSliceSizeAndStrideFactorToCsr) } -TYPED_TEST(Sellp, CountsNonzeros) -{ - gko::size_type nonzeros; - - gko::kernels::reference::sellp::count_nonzeros(this->exec, this->mtx1.get(), - &nonzeros); - - ASSERT_EQ(nonzeros, 4); -} - - TYPED_TEST(Sellp, ExtractsDiagonal) { using T = typename TestFixture::value_type; From 03bd279d1f65d40f75aeb5997ca10538650b664a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 12:45:43 +0100 Subject: [PATCH 14/32] fix benchmark warnings --- benchmark/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index af8d38e3eb3..7bcecb39fb6 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -12,6 +12,12 @@ endfunction() function(ginkgo_benchmark_cusparse_linops type def) add_library(cusparse_linops_${type} utils/cuda_linops.cu) + if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + # remove false positive CUDA warnings when calling one() and zero() + target_compile_options(cusparse_linops_${type} + PRIVATE + $<$:--expt-relaxed-constexpr>) + endif() # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE}) From 89e5dac3465cdf864aba1015e934edd90676d2f6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 12:47:40 +0100 Subject: [PATCH 15/32] fix sellp conversions --- common/cuda_hip/matrix/dense_kernels.hpp.inc | 33 ++++++++++---------- core/matrix/csr.cpp | 8 ++--- core/matrix/dense.cpp | 8 ++--- cuda/matrix/dense_kernels.cu | 1 + dpcpp/matrix/dense_kernels.dp.cpp | 8 ++--- hip/matrix/dense_kernels.hip.cpp | 9 ++---- 6 files changed, 26 insertions(+), 41 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index 5376d690d5d..28eb7dd7332 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -201,28 +201,27 @@ __global__ __launch_bounds__(default_block_size) void fill_in_sellp( const auto slice_length = slice_lengths[slice]; auto warp = group::tiled_partition( group::this_thread_block()); - auto lane_prefix_mask = - (config::lane_mask_type(1) << warp.thread_rank()) - 1; - auto base_out_idx = slice_sets[slice]; + const auto lane = warp.thread_rank(); + const auto prefix_mask = (config::lane_mask_type{1} << lane) - 1; + const auto slice_end = slice_sets[slice + 1] * slice_size; + auto base_idx = slice_sets[slice] * slice_size + local_row; for (size_type i = 0; i < num_cols; i += config::warp_size) { - const auto col = i + warp.thread_rank(); - const auto pred = - col < num_cols ? is_nonzero(source[stride * row + col]) : false; + const auto col = i + lane; + const auto val = checked_load(source + stride * row, col, num_cols, + zero()); + const auto pred = is_nonzero(val); const auto mask = warp.ballot(pred); - const auto out_idx = - local_row + - (base_out_idx + popcnt(mask & lane_prefix_mask)) * slice_size; + const auto idx = base_idx + popcnt(mask & prefix_mask) * slice_size; if (pred) { - values[out_idx] = source[stride * row + col]; - col_idxs[out_idx] = col; + values[idx] = val; + col_idxs[idx] = col; } - base_out_idx += popcnt(mask); + base_idx += popcnt(mask) * slice_size; } - for (size_type i = base_out_idx + warp.thread_rank(); i < slice_length; - i += config::warp_size) { - const auto out_idx = local_row + i * slice_size; - values[out_idx] = zero(); - col_idxs[out_idx] = 0; + for (auto i = base_idx + lane * slice_size; i < slice_end; + i += config::warp_size * slice_size) { + values[i] = zero(); + col_idxs[i] = 0; } } } diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 794a0404c57..f6e3794ccf9 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -256,12 +256,8 @@ void Csr::convert_to( Sellp* result) const { auto exec = this->get_executor(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? default_stride_factor - : result->get_stride_factor(); - const auto slice_size = (result->get_slice_size() == 0) - ? default_slice_size - : result->get_slice_size(); + const auto stride_factor = result->get_stride_factor(); + const auto slice_size = result->get_slice_size(); const auto num_rows = this->get_size()[0]; const auto num_slices = ceildiv(num_rows, slice_size); auto tmp = make_temporary_clone(exec, result); diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 54319e63f57..3f6fe48877c 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -554,12 +554,8 @@ void Dense::convert_impl(Sellp* result) const { auto exec = this->get_executor(); const auto num_rows = this->get_size()[0]; - const auto stride_factor = (result->get_stride_factor() == 0) - ? default_stride_factor - : result->get_stride_factor(); - const auto slice_size = (result->get_slice_size() == 0) - ? default_slice_size - : result->get_slice_size(); + const auto stride_factor = result->get_stride_factor(); + const auto slice_size = result->get_slice_size(); const auto num_slices = ceildiv(num_rows, slice_size); auto tmp = make_temporary_clone(exec, result); tmp->stride_factor_ = stride_factor; diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 9ee63d0954f..01c9d54c03f 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index e511ddcbe2d..efc750d1424 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -672,12 +672,8 @@ void convert_to_sellp(std::shared_ptr exec, auto slice_lengths = result->get_slice_lengths(); auto slice_sets = result->get_slice_sets(); - const auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); + const auto slice_size = result->get_slice_size(); + const auto stride_factor = result->get_stride_factor(); const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 5b74518e23c..6a48c229964 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/hipblas_bindings.hip.hpp" @@ -267,12 +268,8 @@ void convert_to_sellp(std::shared_ptr exec, auto slice_lengths = result->get_slice_lengths(); auto slice_sets = result->get_slice_sets(); - const auto slice_size = (result->get_slice_size() == 0) - ? matrix::default_slice_size - : result->get_slice_size(); - const auto stride_factor = (result->get_stride_factor() == 0) - ? matrix::default_stride_factor - : result->get_stride_factor(); + const auto slice_size = result->get_slice_size(); + const auto stride_factor = result->get_stride_factor(); auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); if (grid_dim > 0) { From ff833b4cc777ce94beaba5e559bc06997e02b62b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 12:49:58 +0100 Subject: [PATCH 16/32] fall back to classical in hip Csr SpMV --- hip/matrix/csr_kernels.hip.cpp | 146 +++++++++++++++++---------------- 1 file changed, 77 insertions(+), 69 deletions(-) diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 265ce8c6ac0..be4d51fe1c0 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -310,29 +310,16 @@ void spmv(std::shared_ptr exec, return items_per_thread == compiled_info; }, syn::value_list(), syn::type_list<>(), exec, a, b, c); - } else if (a->get_strategy()->get_name() == "classical") { - IndexType max_length_per_row = 0; - using Tcsr = matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>(a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - GKO_NOT_SUPPORTED(a->get_strategy()); - } - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); - } else if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - if (hipsparse::is_supported::value) { - // TODO: add implementation for int64 and multiple RHS + } else { + bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse"); + try_sparselib = try_sparselib && + hipsparse::is_supported::value; + try_sparselib = + try_sparselib && b->get_stride() == 1 && c->get_stride() == 1; + // rocSPARSE has issues with zero matrices + try_sparselib = try_sparselib && a->get_num_stored_elements() > 0; + if (try_sparselib) { auto handle = exec->get_hipsparse_handle(); auto descr = hipsparse::create_mat_descr(); { @@ -341,9 +328,6 @@ void spmv(std::shared_ptr exec, auto col_idxs = a->get_const_col_idxs(); auto alpha = one(); auto beta = zero(); - if (b->get_stride() != 1 || c->get_stride() != 1) { - GKO_NOT_IMPLEMENTED; - } hipsparse::spmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), &alpha, descr, @@ -352,10 +336,30 @@ void spmv(std::shared_ptr exec, } hipsparse::destroy(descr); } else { - GKO_NOT_IMPLEMENTED; + IndexType max_length_per_row = 0; + using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + // as a fall-back: use average row length, at least 1 + max_length_per_row = std::max( + a->get_num_stored_elements() / + std::max(a->get_size()[0], 1), + 1); + } + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); } - } else { - GKO_NOT_IMPLEMENTED; } } @@ -392,18 +396,31 @@ void advanced_spmv(std::shared_ptr exec, } else { GKO_NOT_SUPPORTED(nwarps); } - } else if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - if (hipsparse::is_supported::value) { - // TODO: add implementation for int64 and multiple RHS + } else if (a->get_strategy()->get_name() == "merge_path") { + int items_per_thread = + host_kernel::compute_items_per_thread(exec); + host_kernel::select_merge_path_spmv( + compiled_kernels(), + [&items_per_thread](int compiled_info) { + return items_per_thread == compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, + beta); + } else { + bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse"); + try_sparselib = try_sparselib && + hipsparse::is_supported::value; + try_sparselib = + try_sparselib && b->get_stride() == 1 && c->get_stride() == 1; + // rocSPARSE has issues with zero matrices + try_sparselib = try_sparselib && a->get_num_stored_elements() > 0; + if (try_sparselib) { auto descr = hipsparse::create_mat_descr(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - if (b->get_stride() != 1 || c->get_stride() != 1) - GKO_NOT_IMPLEMENTED; - hipsparse::spmv(exec->get_hipsparse_handle(), HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), @@ -414,40 +431,31 @@ void advanced_spmv(std::shared_ptr exec, hipsparse::destroy(descr); } else { - GKO_NOT_IMPLEMENTED; - } - } else if (a->get_strategy()->get_name() == "classical") { - IndexType max_length_per_row = 0; - using Tcsr = matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>(a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - GKO_NOT_SUPPORTED(a->get_strategy()); + IndexType max_length_per_row = 0; + using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + // as a fall-back: use average row length, at least 1 + max_length_per_row = std::max( + a->get_num_stored_elements() / + std::max(a->get_size()[0], 1), + 1); + } + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); - } else if (a->get_strategy()->get_name() == "merge_path") { - int items_per_thread = - host_kernel::compute_items_per_thread(exec); - host_kernel::select_merge_path_spmv( - compiled_kernels(), - [&items_per_thread](int compiled_info) { - return items_per_thread == compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); - } else { - GKO_NOT_IMPLEMENTED; } } From 57c56f9f0ab6332f81e2c8624645a1bbc6e0eb48 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 12:59:49 +0100 Subject: [PATCH 17/32] fix sellp SpMV on GPUs --- common/cuda_hip/matrix/sellp_kernels.hpp.inc | 61 ++++++++--------- cuda/matrix/sellp_kernels.cu | 12 ++-- dpcpp/matrix/sellp_kernels.dp.cpp | 70 +++++++++----------- hip/matrix/sellp_kernels.hip.cpp | 18 ++--- 4 files changed, 73 insertions(+), 88 deletions(-) diff --git a/common/cuda_hip/matrix/sellp_kernels.hpp.inc b/common/cuda_hip/matrix/sellp_kernels.hpp.inc index 49076960a32..6baeb013dcb 100644 --- a/common/cuda_hip/matrix/sellp_kernels.hpp.inc +++ b/common/cuda_hip/matrix/sellp_kernels.hpp.inc @@ -32,55 +32,48 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template -__global__ __launch_bounds__(matrix::default_slice_size) void spmv_kernel( +__global__ __launch_bounds__(default_block_size) void spmv_kernel( size_type num_rows, size_type num_right_hand_sides, size_type b_stride, - size_type c_stride, const size_type* __restrict__ slice_lengths, + size_type c_stride, size_type slice_size, const size_type* __restrict__ slice_sets, const ValueType* __restrict__ a, const IndexType* __restrict__ col, const ValueType* __restrict__ b, ValueType* __restrict__ c) { - const auto slice_id = blockIdx.x; - const auto slice_size = blockDim.x; - const auto row_in_slice = threadIdx.x; - const auto global_row = - static_cast(slice_size) * slice_id + row_in_slice; + const auto row = thread::get_thread_id_flat(); + const auto slice_id = row / slice_size; + const auto row_in_slice = row % slice_size; const auto column_id = blockIdx.y; - ValueType val = 0; - IndexType ind = 0; - if (global_row < num_rows && column_id < num_right_hand_sides) { - for (size_type i = 0; i < slice_lengths[slice_id]; i++) { - ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size; + auto val = zero(); + if (row < num_rows && column_id < num_right_hand_sides) { + for (auto i = slice_sets[slice_id]; i < slice_sets[slice_id + 1]; i++) { + const auto ind = row_in_slice + i * slice_size; val += a[ind] * b[col[ind] * b_stride + column_id]; } - c[global_row * c_stride + column_id] = val; + c[row * c_stride + column_id] = val; } } template -__global__ - __launch_bounds__(matrix::default_slice_size) void advanced_spmv_kernel( - size_type num_rows, size_type num_right_hand_sides, size_type b_stride, - size_type c_stride, const size_type* __restrict__ slice_lengths, - const size_type* __restrict__ slice_sets, - const ValueType* __restrict__ alpha, const ValueType* __restrict__ a, - const IndexType* __restrict__ col, const ValueType* __restrict__ b, - const ValueType* __restrict__ beta, ValueType* __restrict__ c) +__global__ __launch_bounds__(default_block_size) void advanced_spmv_kernel( + size_type num_rows, size_type num_right_hand_sides, size_type b_stride, + size_type c_stride, size_type slice_size, + const size_type* __restrict__ slice_sets, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ a, + const IndexType* __restrict__ col, const ValueType* __restrict__ b, + const ValueType* __restrict__ beta, ValueType* __restrict__ c) { - const auto slice_id = blockIdx.x; - const auto slice_size = blockDim.x; - const auto row_in_slice = threadIdx.x; - const auto global_row = - static_cast(slice_size) * slice_id + row_in_slice; + const auto row = thread::get_thread_id_flat(); + const auto slice_id = row / slice_size; + const auto row_in_slice = row % slice_size; const auto column_id = blockIdx.y; - ValueType val = 0; - IndexType ind = 0; - if (global_row < num_rows && column_id < num_right_hand_sides) { - for (size_type i = 0; i < slice_lengths[slice_id]; i++) { - ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size; - val += alpha[0] * a[ind] * b[col[ind] * b_stride + column_id]; + auto val = zero(); + if (row < num_rows && column_id < num_right_hand_sides) { + for (auto i = slice_sets[slice_id]; i < slice_sets[slice_id + 1]; i++) { + const auto ind = row_in_slice + i * slice_size; + val += a[ind] * b[col[ind] * b_stride + column_id]; } - c[global_row * c_stride + column_id] = - beta[0] * c[global_row * c_stride + column_id] + val; + c[row * c_stride + column_id] = + beta[0] * c[row * c_stride + column_id] + alpha[0] * val; } } diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index 15683d4d3e5..58e2cf96e2f 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -70,13 +70,13 @@ void spmv(std::shared_ptr exec, const matrix::Sellp* a, const matrix::Dense* b, matrix::Dense* c) { - const dim3 blockSize(matrix::default_slice_size); - const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + const dim3 blockSize(default_block_size); + const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); spmv_kernel<<>>( a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_const_slice_lengths(), a->get_const_slice_sets(), + a->get_slice_size(), a->get_const_slice_sets(), as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), as_cuda_type(b->get_const_values()), as_cuda_type(c->get_values())); } @@ -92,13 +92,13 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - const dim3 blockSize(matrix::default_slice_size); - const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + const dim3 blockSize(default_block_size); + const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); advanced_spmv_kernel<<>>( a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_const_slice_lengths(), a->get_const_slice_sets(), + a->get_slice_size(), a->get_const_slice_sets(), as_cuda_type(alpha->get_const_values()), as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), as_cuda_type(b->get_const_values()), diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp index fa37e6b8e7e..06c9c5b7e80 100644 --- a/dpcpp/matrix/sellp_kernels.dp.cpp +++ b/dpcpp/matrix/sellp_kernels.dp.cpp @@ -71,28 +71,24 @@ namespace { template void spmv_kernel(size_type num_rows, size_type num_right_hand_sides, - size_type b_stride, size_type c_stride, - const size_type* __restrict__ slice_lengths, + size_type b_stride, size_type c_stride, size_type slice_size, const size_type* __restrict__ slice_sets, const ValueType* __restrict__ a, const IndexType* __restrict__ col, const ValueType* __restrict__ b, ValueType* __restrict__ c, sycl::nd_item<3> item_ct1) { - const auto slice_id = item_ct1.get_group(2); - const auto slice_size = item_ct1.get_local_range().get(2); - const auto row_in_slice = item_ct1.get_local_id(2); - const auto global_row = - static_cast(slice_size) * slice_id + row_in_slice; + const auto row = thread::get_thread_id_flat(item_ct1); + const auto slice_id = row / slice_size; + const auto row_in_slice = row % slice_size; const auto column_id = item_ct1.get_group(1); - ValueType val = 0; - IndexType ind = 0; - if (global_row < num_rows && column_id < num_right_hand_sides) { - for (size_type i = 0; i < slice_lengths[slice_id]; i++) { - ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size; + auto val = zero(); + if (row < num_rows && column_id < num_right_hand_sides) { + for (auto i = slice_sets[slice_id]; i < slice_sets[slice_id + 1]; i++) { + const auto ind = row_in_slice + i * slice_size; val += a[ind] * b[col[ind] * b_stride + column_id]; } - c[global_row * c_stride + column_id] = val; + c[row * c_stride + column_id] = val; } } @@ -102,7 +98,7 @@ GKO_ENABLE_DEFAULT_HOST(spmv_kernel, spmv_kernel); template void advanced_spmv_kernel(size_type num_rows, size_type num_right_hand_sides, size_type b_stride, size_type c_stride, - const size_type* __restrict__ slice_lengths, + size_type slice_size, const size_type* __restrict__ slice_sets, const ValueType* __restrict__ alpha, const ValueType* __restrict__ a, @@ -111,21 +107,18 @@ void advanced_spmv_kernel(size_type num_rows, size_type num_right_hand_sides, const ValueType* __restrict__ beta, ValueType* __restrict__ c, sycl::nd_item<3> item_ct1) { - const auto slice_id = item_ct1.get_group(2); - const auto slice_size = item_ct1.get_local_range().get(2); - const auto row_in_slice = item_ct1.get_local_id(2); - const auto global_row = - static_cast(slice_size) * slice_id + row_in_slice; + const auto row = thread::get_thread_id_flat(item_ct1); + const auto slice_id = row / slice_size; + const auto row_in_slice = row % slice_size; const auto column_id = item_ct1.get_group(1); - ValueType val = 0; - IndexType ind = 0; - if (global_row < num_rows && column_id < num_right_hand_sides) { - for (size_type i = 0; i < slice_lengths[slice_id]; i++) { - ind = row_in_slice + (slice_sets[slice_id] + i) * slice_size; - val += alpha[0] * a[ind] * b[col[ind] * b_stride + column_id]; + auto val = zero(); + if (row < num_rows && column_id < num_right_hand_sides) { + for (auto i = slice_sets[slice_id]; i < slice_sets[slice_id + 1]; i++) { + const auto ind = row_in_slice + i * slice_size; + val += a[ind] * b[col[ind] * b_stride + column_id]; } - c[global_row * c_stride + column_id] = - beta[0] * c[global_row * c_stride + column_id] + val; + c[row * c_stride + column_id] = + beta[0] * c[row * c_stride + column_id] + alpha[0] * val; } } @@ -140,13 +133,13 @@ void spmv(std::shared_ptr exec, const matrix::Sellp* a, const matrix::Dense* b, matrix::Dense* c) { - const dim3 blockSize(matrix::default_slice_size); - const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + const dim3 blockSize(default_block_size); + const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); spmv_kernel(gridSize, blockSize, 0, exec->get_queue(), a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_const_slice_lengths(), a->get_const_slice_sets(), + a->get_slice_size(), a->get_const_slice_sets(), a->get_const_values(), a->get_const_col_idxs(), b->get_const_values(), c->get_values()); } @@ -162,17 +155,16 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - const dim3 blockSize(matrix::default_slice_size); - const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + const dim3 blockSize(default_block_size); + const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); - advanced_spmv_kernel(gridSize, blockSize, 0, exec->get_queue(), - a->get_size()[0], b->get_size()[1], b->get_stride(), - c->get_stride(), a->get_const_slice_lengths(), - a->get_const_slice_sets(), alpha->get_const_values(), - a->get_const_values(), a->get_const_col_idxs(), - b->get_const_values(), beta->get_const_values(), - c->get_values()); + advanced_spmv_kernel( + gridSize, blockSize, 0, exec->get_queue(), a->get_size()[0], + b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), + a->get_const_slice_sets(), alpha->get_const_values(), + a->get_const_values(), a->get_const_col_idxs(), b->get_const_values(), + beta->get_const_values(), c->get_values()); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index 08efddec844..af381e4b906 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -73,16 +73,16 @@ void spmv(std::shared_ptr exec, const matrix::Sellp* a, const matrix::Dense* b, matrix::Dense* c) { - const dim3 blockSize(matrix::default_slice_size); - const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + const dim3 blockSize(default_block_size); + const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); hipLaunchKernelGGL( spmv_kernel, dim3(gridSize), dim3(blockSize), 0, 0, a->get_size()[0], - b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_const_slice_lengths(), a->get_const_slice_sets(), - as_hip_type(a->get_const_values()), a->get_const_col_idxs(), - as_hip_type(b->get_const_values()), as_hip_type(c->get_values())); + b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), + a->get_const_slice_sets(), as_hip_type(a->get_const_values()), + a->get_const_col_idxs(), as_hip_type(b->get_const_values()), + as_hip_type(c->get_values())); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); @@ -96,14 +96,14 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - const dim3 blockSize(matrix::default_slice_size); - const dim3 gridSize(ceildiv(a->get_size()[0], matrix::default_slice_size), + const dim3 blockSize(default_block_size); + const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); hipLaunchKernelGGL( advanced_spmv_kernel, dim3(gridSize), dim3(blockSize), 0, 0, a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_const_slice_lengths(), a->get_const_slice_sets(), + a->get_slice_size(), a->get_const_slice_sets(), as_hip_type(alpha->get_const_values()), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(b->get_const_values()), From eae6e48b157ca0548b33b82e5b11dbea5a6db876 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 13:00:18 +0100 Subject: [PATCH 18/32] fix const-correctness for Hybrid strategy --- include/ginkgo/core/matrix/hybrid.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index f7fd81e8b08..6c85151f14b 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -351,7 +351,7 @@ class Hybrid * * @retrun percent */ - auto get_percentage() { return strategy_.get_percentage(); } + auto get_percentage() const { return strategy_.get_percentage(); } private: imbalance_limit strategy_; From b2dcca0aa4215d6541d52e62a3af2b46698b858c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 13:00:55 +0100 Subject: [PATCH 19/32] zero Fbcsr and Sellp pointer arrays --- include/ginkgo/core/matrix/fbcsr.hpp | 1 + include/ginkgo/core/matrix/sellp.hpp | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index 3569e6f4866..47d8c5039c9 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -378,6 +378,7 @@ class Fbcsr : public EnableLinOp>, row_ptrs_(exec, detail::get_num_blocks(block_size, size[0]) + 1) { GKO_ASSERT_BLOCK_SIZE_CONFORMANT(size[1], bs_); + row_ptrs_.fill(0); } /** diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 4f66c72f328..321f6f425cd 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -335,7 +335,10 @@ class Sellp : public EnableLinOp>, slice_sets_(exec, ceildiv(size[0], slice_size) + 1), slice_size_(slice_size), stride_factor_(stride_factor) - {} + { + slice_sets_.fill(0); + slice_lengths_.fill(0); + } void apply_impl(const LinOp* b, LinOp* x) const override; From 72f0bbee9d1310854dda3cc1c1ff955952cb402d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 13:04:59 +0100 Subject: [PATCH 20/32] work around cuSPARSE issues with empty matrices --- cuda/matrix/csr_kernels.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 5987898ee9f..ce746cc6364 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -317,6 +317,10 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, cusparse::destroy(descr); #else // CUDA_VERSION >= 11000 + // workaround for a division by zero in cuSPARSE 11.? + if (a->get_size()[1] == 0) { + return false; + } cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; auto row_ptrs = const_cast(a->get_const_row_ptrs()); auto col_idxs = const_cast(a->get_const_col_idxs()); From 77422eec29646dcfeb7e0c5e7df00f2cfe35ae6a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 13:05:27 +0100 Subject: [PATCH 21/32] add Fbcsr conversion to Dense --- omp/matrix/fbcsr_kernels.cpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index 1cffc0334aa..4dcc13b2140 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -212,7 +212,31 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* const source, - matrix::Dense* const result) GKO_NOT_IMPLEMENTED; + matrix::Dense* const result) +{ + const auto bs = source->get_block_size(); + const auto nbrows = source->get_num_block_rows(); + const auto nbnz = source->get_num_stored_blocks(); + auto row_ptrs = source->get_const_row_ptrs(); + auto col_idxs = source->get_const_col_idxs(); + const acc::range> values{ + to_std_array(nbnz, bs, bs), source->get_const_values()}; +#pragma omp parallel for + for (size_type block_row = 0; block_row < nbrows; block_row++) { + const auto row_begin = row_ptrs[block_row]; + const auto row_end = row_ptrs[block_row + 1]; + for (auto block = row_begin; block < row_end; block++) { + const auto block_col = col_idxs[block]; + for (int local_row = 0; local_row < bs; local_row++) { + const auto row = block_row * bs + local_row; + for (int local_col = 0; local_col < bs; local_col++) { + const auto col = block_col * bs + local_col; + result->at(row, col) = values(block, local_row, local_col); + } + } + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); From 31ae3e678e61c6e0708be4dbdff8743d1cbed1fb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 14:08:22 +0100 Subject: [PATCH 22/32] guard empty CUDA kernel launches --- cuda/base/kernel_launch_reduction.cuh | 115 +++++--- cuda/components/prefix_sum_kernels.cu | 8 +- cuda/components/reduction.cuh | 7 +- cuda/factorization/factorization_kernels.cu | 87 +++--- cuda/factorization/par_ic_kernels.cu | 19 +- cuda/factorization/par_ict_kernels.cu | 38 +-- cuda/factorization/par_ilu_kernels.cu | 18 +- .../par_ilut_approx_filter_kernel.cu | 17 +- cuda/factorization/par_ilut_filter_kernel.cu | 21 +- cuda/factorization/par_ilut_select_common.cu | 8 +- cuda/factorization/par_ilut_select_kernel.cu | 8 +- cuda/factorization/par_ilut_spgeam_kernel.cu | 25 +- cuda/factorization/par_ilut_sweep_kernel.cu | 22 +- cuda/matrix/csr_kernels.cu | 269 +++++++++++------- cuda/matrix/diagonal_kernels.cu | 8 +- cuda/matrix/ell_kernels.cu | 30 +- cuda/matrix/fbcsr_kernels.cu | 33 ++- cuda/matrix/sellp_kernels.cu | 29 +- cuda/multigrid/amgx_pgm_kernels.cu | 60 ++-- cuda/preconditioner/isai_kernels.cu | 92 +++--- .../jacobi_advanced_apply_instantiate.inc.cu | 29 +- .../jacobi_generate_instantiate.inc.cu | 36 +-- cuda/preconditioner/jacobi_kernels.cu | 41 +-- .../jacobi_simple_apply_instantiate.inc.cu | 27 +- cuda/solver/multigrid_kernels.cu | 42 +-- cuda/stop/criterion_kernels.cu | 8 +- cuda/stop/residual_norm_kernels.cu | 28 +- 27 files changed, 665 insertions(+), 460 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index b495ccd1222..9ecdfcceb7a 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -148,10 +148,12 @@ void run_kernel_reduction(std::shared_ptr exec, ceildiv(size, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_1d<<>>( - static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), map_to_device(args)...); + if (num_blocks > 0) { + generic_kernel_reduction_1d<<>>( + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), map_to_device(args)...); + } generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, @@ -181,10 +183,12 @@ void run_kernel_reduction(std::shared_ptr exec, exec->get_num_warps() * oversubscription); if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(partial.get_data()), - map_to_device(args)...); + if (num_blocks > 0) { + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(partial.get_data()), + map_to_device(args)...); + } generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, @@ -366,10 +370,13 @@ void run_generic_kernel_row_reduction(syn::value_list, { const auto num_blocks = ceildiv(rows * col_blocks * subwarp_size, default_block_size); - generic_kernel_row_reduction_2d - <<>>( - rows, cols, col_blocks, fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), result_stride, args...); + if (num_blocks > 0) { + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, finalize, + as_cuda_type(identity), as_cuda_type(result), result_stride, + args...); + } } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, @@ -399,16 +406,20 @@ void run_generic_col_reduction_small(syn::value_list, } else { Array tmp_storage{exec, static_cast(num_blocks * cols)}; - generic_kernel_col_reduction_2d_small - <<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), - args...); - generic_kernel_reduction_finalize_2d<<< - ceildiv(cols, default_block_size), default_block_size>>>( - cols, num_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(tmp_storage.get_const_data()), 1, - as_cuda_type(result)); + if (num_blocks > 0) { + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), + as_cuda_type(tmp_storage.get_data()), args...); + } + if (cols > 0) { + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, num_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } } } @@ -440,17 +451,22 @@ void run_kernel_row_reduction(std::shared_ptr exec, static_cast(col_blocks * rows)}; const auto num_blocks = ceildiv(rows * col_blocks * config::warp_size, default_block_size); - generic_kernel_row_reduction_2d - <<>>( - rows, cols, col_blocks, fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), 1, map_to_device(args)...); + if (num_blocks > 0) { + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), 1, + map_to_device(args)...); + } const auto num_finalize_blocks = ceildiv(rows, default_block_size); - generic_kernel_reduction_finalize_2d<<>>( - rows, col_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(partial.get_const_data()), - static_cast(result_stride), as_cuda_type(result)); + if (num_finalize_blocks > 0) { + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(partial.get_const_data()), + static_cast(result_stride), as_cuda_type(result)); + } } else { select_run_generic_kernel_row_reduction( subwarp_sizes(), @@ -497,23 +513,30 @@ void run_kernel_col_reduction(std::shared_ptr exec, max_blocks), col_blocks); if (row_blocks <= 1) { - generic_kernel_col_reduction_2d_blocked<<>>( - rows, cols, fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); + if (col_blocks > 0) { + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); + } } else { Array tmp_storage{ exec, static_cast(row_blocks * cols)}; - generic_kernel_col_reduction_2d_blocked<<< - dim3(row_blocks, col_blocks), default_block_size>>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), - map_to_device(args)...); - generic_kernel_reduction_finalize_2d<<< - ceildiv(cols, default_block_size), default_block_size>>>( - cols, row_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(tmp_storage.get_const_data()), 1, - as_cuda_type(result)); + if (row_blocks * col_blocks > 0) { + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), + as_cuda_type(tmp_storage.get_data()), + map_to_device(args)...); + } + if (cols > 0) { + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } } } } diff --git a/cuda/components/prefix_sum_kernels.cu b/cuda/components/prefix_sum_kernels.cu index decbaecb60d..3a2d1f86455 100644 --- a/cuda/components/prefix_sum_kernels.cu +++ b/cuda/components/prefix_sum_kernels.cu @@ -54,9 +54,11 @@ void prefix_sum(std::shared_ptr exec, IndexType* counts, auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); auto block_sums = block_sum_array.get_data(); - start_prefix_sum - <<>>(num_entries, counts, - block_sums); + if (num_blocks > 0) { + start_prefix_sum + <<>>(num_entries, counts, + block_sums); + } // add the total sum of the previous block only when the number of // blocks is larger than 1. if (num_blocks > 1) { diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index 4d73fb736b0..892c84368d7 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -82,8 +82,11 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, block_results.resize_and_reset(grid_dim); - reduce_add_array<<>>( - size, as_cuda_type(source), as_cuda_type(block_results.get_data())); + if (grid_dim > 0) { + reduce_add_array<<>>( + size, as_cuda_type(source), + as_cuda_type(block_results.get_data())); + } block_results_val = block_results.get_const_data(); } diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu index b4be34c68db..02aa605d9bb 100644 --- a/cuda/factorization/factorization_kernels.cu +++ b/cuda/factorization/factorization_kernels.cu @@ -91,18 +91,20 @@ void add_diagonal_elements(std::shared_ptr exec, const dim3 grid_dim{ static_cast(ceildiv(num_rows, block_dim.x / subwarp_size)), 1, 1}; - if (is_sorted) { - kernel::find_missing_diagonal_elements - <<>>( - num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, - cuda_row_ptrs_add, - as_cuda_type(needs_change_device.get_data())); - } else { - kernel::find_missing_diagonal_elements - <<>>( - num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, - cuda_row_ptrs_add, - as_cuda_type(needs_change_device.get_data())); + if (num_rows > 0) { + if (is_sorted) { + kernel::find_missing_diagonal_elements + <<>>( + num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, + cuda_row_ptrs_add, + as_cuda_type(needs_change_device.get_data())); + } else { + kernel::find_missing_diagonal_elements + <<>>( + num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, + cuda_row_ptrs_add, + as_cuda_type(needs_change_device.get_data())); + } } needs_change_host = needs_change_device; if (!needs_change_host.get_const_data()[0]) { @@ -123,6 +125,7 @@ void add_diagonal_elements(std::shared_ptr exec, auto cuda_new_values = as_cuda_type(new_values.get_data()); auto cuda_new_col_idxs = as_cuda_type(new_col_idxs.get_data()); + // no empty kernel guard needed here, we exit earlier already kernel::add_missing_diagonal_elements <<>>(num_rows, cuda_old_values, cuda_old_col_idxs, cuda_old_row_ptrs, cuda_new_values, @@ -155,11 +158,13 @@ void initialize_row_ptrs_l_u( ceildiv(num_rows, static_cast(block_size.x)); const dim3 grid_dim{number_blocks, 1, 1}; - kernel::count_nnz_per_l_u_row<<>>( - num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), - as_cuda_type(system_matrix->get_const_col_idxs()), - as_cuda_type(system_matrix->get_const_values()), - as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs)); + if (num_rows > 0) { + kernel::count_nnz_per_l_u_row<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs)); + } components::prefix_sum(exec, l_row_ptrs, num_rows + 1); components::prefix_sum(exec, u_row_ptrs, num_rows + 1); @@ -181,14 +186,18 @@ void initialize_l_u(std::shared_ptr exec, num_rows, static_cast(block_size.x))), 1, 1}; - kernel::initialize_l_u<<>>( - num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), - as_cuda_type(system_matrix->get_const_col_idxs()), - as_cuda_type(system_matrix->get_const_values()), - as_cuda_type(csr_l->get_const_row_ptrs()), - as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()), - as_cuda_type(csr_u->get_const_row_ptrs()), - as_cuda_type(csr_u->get_col_idxs()), as_cuda_type(csr_u->get_values())); + if (num_rows > 0) { + kernel::initialize_l_u<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(csr_l->get_const_row_ptrs()), + as_cuda_type(csr_l->get_col_idxs()), + as_cuda_type(csr_l->get_values()), + as_cuda_type(csr_u->get_const_row_ptrs()), + as_cuda_type(csr_u->get_col_idxs()), + as_cuda_type(csr_u->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -208,11 +217,13 @@ void initialize_row_ptrs_l( ceildiv(num_rows, static_cast(block_size.x)); const dim3 grid_dim{number_blocks, 1, 1}; - kernel::count_nnz_per_l_row<<>>( - num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), - as_cuda_type(system_matrix->get_const_col_idxs()), - as_cuda_type(system_matrix->get_const_values()), - as_cuda_type(l_row_ptrs)); + if (num_rows > 0) { + kernel::count_nnz_per_l_row<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(l_row_ptrs)); + } components::prefix_sum(exec, l_row_ptrs, num_rows + 1); } @@ -232,13 +243,15 @@ void initialize_l(std::shared_ptr exec, num_rows, static_cast(block_size.x))), 1, 1}; - kernel::initialize_l<<>>( - num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), - as_cuda_type(system_matrix->get_const_col_idxs()), - as_cuda_type(system_matrix->get_const_values()), - as_cuda_type(csr_l->get_const_row_ptrs()), - as_cuda_type(csr_l->get_col_idxs()), as_cuda_type(csr_l->get_values()), - diag_sqrt); + if (num_rows > 0) { + kernel::initialize_l<<>>( + num_rows, as_cuda_type(system_matrix->get_const_row_ptrs()), + as_cuda_type(system_matrix->get_const_col_idxs()), + as_cuda_type(system_matrix->get_const_values()), + as_cuda_type(csr_l->get_const_row_ptrs()), + as_cuda_type(csr_l->get_col_idxs()), + as_cuda_type(csr_l->get_values()), diag_sqrt); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index 9f1b98c23c7..4318cd1e1d9 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -73,8 +73,10 @@ void init_factor(std::shared_ptr exec, auto num_blocks = ceildiv(num_rows, default_block_size); auto l_row_ptrs = l->get_const_row_ptrs(); auto l_vals = l->get_values(); - kernel::ic_init<<>>( - l_row_ptrs, as_cuda_type(l_vals), num_rows); + if (num_rows > 0) { + kernel::ic_init<<>>( + l_row_ptrs, as_cuda_type(l_vals), num_rows); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -90,11 +92,14 @@ void compute_factor(std::shared_ptr exec, auto nnz = l->get_num_stored_elements(); auto num_blocks = ceildiv(nnz, default_block_size); for (size_type i = 0; i < iterations; ++i) { - kernel::ic_sweep<<>>( - a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(), - as_cuda_type(a_lower->get_const_values()), l->get_const_row_ptrs(), - l->get_const_col_idxs(), as_cuda_type(l->get_values()), - static_cast(l->get_num_stored_elements())); + if (num_blocks > 0) { + kernel::ic_sweep<<>>( + a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(), + as_cuda_type(a_lower->get_const_values()), + l->get_const_row_ptrs(), l->get_const_col_idxs(), + as_cuda_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } } diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index 07b63dc58a0..2f65affb628 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -103,10 +103,12 @@ void add_candidates(syn::value_list, auto l_vals = l->get_const_values(); auto l_new_row_ptrs = l_new->get_row_ptrs(); // count non-zeros per row - kernel::ict_tri_spgeam_nnz - <<>>(llh_row_ptrs, llh_col_idxs, - a_row_ptrs, a_col_idxs, - l_new_row_ptrs, num_rows); + if (num_blocks > 0) { + kernel::ict_tri_spgeam_nnz + <<>>(llh_row_ptrs, llh_col_idxs, + a_row_ptrs, a_col_idxs, + l_new_row_ptrs, num_rows); + } // build row ptrs components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); @@ -120,12 +122,14 @@ void add_candidates(syn::value_list, auto l_new_vals = l_new->get_values(); // fill columns and values - kernel::ict_tri_spgeam_init - <<>>( - llh_row_ptrs, llh_col_idxs, as_cuda_type(llh_vals), a_row_ptrs, - a_col_idxs, as_cuda_type(a_vals), l_row_ptrs, l_col_idxs, - as_cuda_type(l_vals), l_new_row_ptrs, l_new_col_idxs, - as_cuda_type(l_new_vals), num_rows); + if (num_blocks > 0) { + kernel::ict_tri_spgeam_init + <<>>( + llh_row_ptrs, llh_col_idxs, as_cuda_type(llh_vals), a_row_ptrs, + a_col_idxs, as_cuda_type(a_vals), l_row_ptrs, l_col_idxs, + as_cuda_type(l_vals), l_new_row_ptrs, l_new_col_idxs, + as_cuda_type(l_new_vals), num_rows); + } } @@ -142,12 +146,14 @@ void compute_factor(syn::value_list, auto total_nnz = static_cast(l->get_num_stored_elements()); auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); - kernel::ict_sweep<<>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_cuda_type(l->get_values()), - static_cast(l->get_num_stored_elements())); + if (num_blocks > 0) { + kernel::ict_sweep<<>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_cuda_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index 1a29c70dd67..5f5f39d3ce9 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -73,14 +73,16 @@ void compute_l_u_factors(std::shared_ptr exec, ceildiv(num_elements, static_cast(block_size.x))), 1, 1}; for (size_type i = 0; i < iterations; ++i) { - kernel::compute_l_u_factors<<>>( - num_elements, system_matrix->get_const_row_idxs(), - system_matrix->get_const_col_idxs(), - as_cuda_type(system_matrix->get_const_values()), - l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(), - as_cuda_type(l_factor->get_values()), - u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(), - as_cuda_type(u_factor->get_values())); + if (grid_dim.x > 0) { + kernel::compute_l_u_factors<<>>( + num_elements, system_matrix->get_const_row_idxs(), + system_matrix->get_const_col_idxs(), + as_cuda_type(system_matrix->get_const_values()), + l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(), + as_cuda_type(l_factor->get_values()), + u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(), + as_cuda_type(u_factor->get_values())); + } } } diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu index ced5a5a13dc..be86588a909 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -139,8 +139,11 @@ void threshold_filter_approx(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(num_rows, block_size); auto new_row_ptrs = m_out->get_row_ptrs(); - kernel::bucket_filter_nnz<<>>( - old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs); + if (num_blocks > 0) { + kernel::bucket_filter_nnz + <<>>( + old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs); + } // build row pointers components::prefix_sum(exec, new_row_ptrs, num_rows + 1); @@ -163,10 +166,12 @@ void threshold_filter_approx(syn::value_list, Array::view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } - kernel::bucket_filter<<>>( - old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), oracles, num_rows, - bucket, new_row_ptrs, new_row_idxs, new_col_idxs, - as_cuda_type(new_vals)); + if (num_blocks > 0) { + kernel::bucket_filter<<>>( + old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), oracles, + num_rows, bucket, new_row_ptrs, new_row_idxs, new_col_idxs, + as_cuda_type(new_vals)); + } } diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index 45a7efe8ec5..6dd83c41835 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -94,10 +94,12 @@ void threshold_filter(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(num_rows, block_size); auto new_row_ptrs = m_out->get_row_ptrs(); - kernel::threshold_filter_nnz - <<>>(old_row_ptrs, - as_cuda_type(old_vals), num_rows, - threshold, new_row_ptrs, lower); + if (num_blocks > 0) { + kernel::threshold_filter_nnz + <<>>( + old_row_ptrs, as_cuda_type(old_vals), num_rows, threshold, + new_row_ptrs, lower); + } // build row pointers components::prefix_sum(exec, new_row_ptrs, num_rows + 1); @@ -120,10 +122,13 @@ void threshold_filter(syn::value_list, Array::view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } - kernel::threshold_filter<<>>( - old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), num_rows, threshold, - new_row_ptrs, new_row_idxs, new_col_idxs, as_cuda_type(new_vals), - lower); + if (num_blocks > 0) { + kernel::threshold_filter + <<>>( + old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), num_rows, + threshold, new_row_ptrs, new_row_idxs, new_col_idxs, + as_cuda_type(new_vals), lower); + } } diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu index 959d7abadd8..870ccb69d91 100644 --- a/cuda/factorization/par_ilut_select_common.cu +++ b/cuda/factorization/par_ilut_select_common.cu @@ -72,9 +72,11 @@ void sampleselect_count(std::shared_ptr exec, kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size, tree); // determine bucket sizes - kernel::count_buckets<<>>( - as_cuda_type(values), size, tree, partial_counts, oracles, - items_per_thread); + if (num_blocks > 0) { + kernel::count_buckets<<>>( + as_cuda_type(values), size, tree, partial_counts, oracles, + items_per_thread); + } // compute prefix sum and total sum over block-local values kernel::block_prefix_sum<<>>( partial_counts, total_counts, num_blocks); diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu index 247518ac3e1..6655db78296 100644 --- a/cuda/factorization/par_ilut_select_kernel.cu +++ b/cuda/factorization/par_ilut_select_kernel.cu @@ -75,9 +75,11 @@ void sampleselect_filter(const ValueType* values, IndexType size, auto num_threads_total = ceildiv(size, items_per_thread); auto num_blocks = static_cast(ceildiv(num_threads_total, default_block_size)); - kernel::filter_bucket<<>>( - as_cuda_type(values), size, bucket, oracles, partial_counts, out, - items_per_thread); + if (num_blocks > 0) { + kernel::filter_bucket<<>>( + as_cuda_type(values), size, bucket, oracles, partial_counts, out, + items_per_thread); + } } diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernel.cu index 2dd5a236a91..f1a6b7518ed 100644 --- a/cuda/factorization/par_ilut_spgeam_kernel.cu +++ b/cuda/factorization/par_ilut_spgeam_kernel.cu @@ -109,9 +109,12 @@ void add_candidates(syn::value_list, auto l_new_row_ptrs = l_new->get_row_ptrs(); auto u_new_row_ptrs = u_new->get_row_ptrs(); // count non-zeros per row - kernel::tri_spgeam_nnz<<>>( - lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs, - u_new_row_ptrs, num_rows); + if (num_blocks > 0) { + kernel::tri_spgeam_nnz + <<>>( + lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, u_new_row_ptrs, num_rows); + } // build row ptrs components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); @@ -131,12 +134,16 @@ void add_candidates(syn::value_list, auto u_new_vals = u_new->get_values(); // fill columns and values - kernel::tri_spgeam_init<<>>( - lu_row_ptrs, lu_col_idxs, as_cuda_type(lu_vals), a_row_ptrs, a_col_idxs, - as_cuda_type(a_vals), l_row_ptrs, l_col_idxs, as_cuda_type(l_vals), - u_row_ptrs, u_col_idxs, as_cuda_type(u_vals), l_new_row_ptrs, - l_new_col_idxs, as_cuda_type(l_new_vals), u_new_row_ptrs, - u_new_col_idxs, as_cuda_type(u_new_vals), num_rows); + if (num_blocks > 0) { + kernel::tri_spgeam_init + <<>>( + lu_row_ptrs, lu_col_idxs, as_cuda_type(lu_vals), a_row_ptrs, + a_col_idxs, as_cuda_type(a_vals), l_row_ptrs, l_col_idxs, + as_cuda_type(l_vals), u_row_ptrs, u_col_idxs, + as_cuda_type(u_vals), l_new_row_ptrs, l_new_col_idxs, + as_cuda_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, + as_cuda_type(u_new_vals), num_rows); + } } diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu index 7f2d6e03201..fb1248158f9 100644 --- a/cuda/factorization/par_ilut_sweep_kernel.cu +++ b/cuda/factorization/par_ilut_sweep_kernel.cu @@ -93,16 +93,18 @@ void compute_l_u_factors(syn::value_list, u->get_num_stored_elements()); auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); - kernel::sweep<<>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_cuda_type(l->get_values()), - static_cast(l->get_num_stored_elements()), - u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), - as_cuda_type(u->get_values()), u_csc->get_const_row_ptrs(), - u_csc->get_const_col_idxs(), as_cuda_type(u_csc->get_values()), - static_cast(u->get_num_stored_elements())); + if (num_blocks > 0) { + kernel::sweep<<>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_cuda_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_cuda_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + as_cuda_type(u->get_values()), u_csc->get_const_row_ptrs(), + u_csc->get_const_col_idxs(), as_cuda_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); + } } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index ce746cc6364..00f38a78b7b 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -125,16 +125,18 @@ void merge_path_spmv(syn::value_list, if (alpha == nullptr && beta == nullptr) { const auto b_vals = b->get_const_values() + column_id; auto c_vals = c->get_values() + column_id; - kernel::abstract_merge_path_spmv - <<>>( - static_cast(a->get_size()[0]), - as_cuda_type(a->get_const_values()), - a->get_const_col_idxs(), - as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(a->get_const_srow()), as_cuda_type(b_vals), - b->get_stride(), as_cuda_type(c_vals), c->get_stride(), - as_cuda_type(row_out.get_data()), - as_cuda_type(val_out.get_data())); + if (grid_num > 0) { + kernel::abstract_merge_path_spmv + <<>>( + static_cast(a->get_size()[0]), + as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(a->get_const_srow()), as_cuda_type(b_vals), + b->get_stride(), as_cuda_type(c_vals), c->get_stride(), + as_cuda_type(row_out.get_data()), + as_cuda_type(val_out.get_data())); + } kernel::abstract_reduce<<<1, spmv_block_size>>>( grid_num, as_cuda_type(val_out.get_data()), as_cuda_type(row_out.get_data()), as_cuda_type(c_vals), @@ -143,18 +145,20 @@ void merge_path_spmv(syn::value_list, } else if (alpha != nullptr && beta != nullptr) { const auto b_vals = b->get_const_values() + column_id; auto c_vals = c->get_values() + column_id; - kernel::abstract_merge_path_spmv - <<>>( - static_cast(a->get_size()[0]), - as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), - a->get_const_col_idxs(), - as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(a->get_const_srow()), as_cuda_type(b_vals), - b->get_stride(), as_cuda_type(beta->get_const_values()), - as_cuda_type(c_vals), c->get_stride(), - as_cuda_type(row_out.get_data()), - as_cuda_type(val_out.get_data())); + if (grid_num > 0) { + kernel::abstract_merge_path_spmv + <<>>( + static_cast(a->get_size()[0]), + as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(a->get_const_srow()), as_cuda_type(b_vals), + b->get_stride(), as_cuda_type(beta->get_const_values()), + as_cuda_type(c_vals), c->get_stride(), + as_cuda_type(row_out.get_data()), + as_cuda_type(val_out.get_data())); + } kernel::abstract_reduce<<<1, spmv_block_size>>>( grid_num, as_cuda_type(val_out.get_data()), as_cuda_type(row_out.get_data()), @@ -226,20 +230,27 @@ void classical_spmv(syn::value_list, const dim3 block(spmv_block_size); if (alpha == nullptr && beta == nullptr) { - kernel::abstract_classical_spmv<<>>( - a->get_size()[0], as_cuda_type(a->get_const_values()), - a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(c->get_values()), c->get_stride()); - + if (grid.x * grid.y > 0) { + kernel::abstract_classical_spmv + <<>>( + a->get_size()[0], as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(c->get_values()), c->get_stride()); + } } else if (alpha != nullptr && beta != nullptr) { - kernel::abstract_classical_spmv<<>>( - a->get_size()[0], as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(beta->get_const_values()), - as_cuda_type(c->get_values()), c->get_stride()); + if (grid.x * grid.y > 0) { + kernel::abstract_classical_spmv + <<>>( + a->get_size()[0], as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(beta->get_const_values()), + as_cuda_type(c->get_values()), c->get_stride()); + } } else { GKO_KERNEL_NOT_FOUND; } @@ -266,24 +277,32 @@ void load_balance_spmv(std::shared_ptr exec, const dim3 csr_block(config::warp_size, warps_in_block, 1); const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); if (alpha) { - kernel::abstract_spmv<<>>( - nwarps, static_cast(a->get_size()[0]), - as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(a->get_const_srow()), - as_cuda_type(b->get_const_values()), - as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()), - as_cuda_type(c->get_stride())); + if (csr_grid.x * csr_grid.y > 0) { + kernel::abstract_spmv<<>>( + nwarps, static_cast(a->get_size()[0]), + as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(a->get_const_srow()), + as_cuda_type(b->get_const_values()), + as_cuda_type(b->get_stride()), + as_cuda_type(c->get_values()), + as_cuda_type(c->get_stride())); + } } else { - kernel::abstract_spmv<<>>( - nwarps, static_cast(a->get_size()[0]), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - as_cuda_type(a->get_const_row_ptrs()), - as_cuda_type(a->get_const_srow()), - as_cuda_type(b->get_const_values()), - as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()), - as_cuda_type(c->get_stride())); + if (csr_grid.x * csr_grid.y > 0) { + kernel::abstract_spmv<<>>( + nwarps, static_cast(a->get_size()[0]), + as_cuda_type(a->get_const_values()), + a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), + as_cuda_type(a->get_const_srow()), + as_cuda_type(b->get_const_values()), + as_cuda_type(b->get_stride()), + as_cuda_type(c->get_values()), + as_cuda_type(c->get_stride())); + } } } else { GKO_NOT_SUPPORTED(nwarps); @@ -651,8 +670,10 @@ void spgeam(syn::value_list, // count nnz for alpha * A + beta * B auto subwarps_per_block = default_block_size / subwarp_size; auto num_blocks = ceildiv(m, subwarps_per_block); - kernel::spgeam_nnz<<>>( - a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + if (num_blocks > 0) { + kernel::spgeam_nnz<<>>( + a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + } // build row pointers components::prefix_sum(exec, c_row_ptrs, m + 1); @@ -664,10 +685,12 @@ void spgeam(syn::value_list, c_builder.get_value_array().resize_and_reset(c_nnz); auto c_col_idxs = c->get_col_idxs(); auto c_vals = c->get_values(); - kernel::spgeam<<>>( - as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals), - as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m, - c_row_ptrs, c_col_idxs, as_cuda_type(c_vals)); + if (num_blocks > 0) { + kernel::spgeam<<>>( + as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals), + as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m, + c_row_ptrs, c_col_idxs, as_cuda_type(c_vals)); + } } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); @@ -865,9 +888,11 @@ void fill_in_dense(std::shared_ptr exec, const auto vals = source->get_const_values(); auto grid_dim = ceildiv(num_rows, default_block_size); - kernel::fill_in_dense<<>>( - num_rows, as_cuda_type(row_ptrs), as_cuda_type(col_idxs), - as_cuda_type(vals), stride, as_cuda_type(result->get_values())); + if (grid_dim > 0) { + kernel::fill_in_dense<<>>( + num_rows, as_cuda_type(row_ptrs), as_cuda_type(col_idxs), + as_cuda_type(vals), stride, as_cuda_type(result->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -966,10 +991,11 @@ void conj_transpose(std::shared_ptr exec, trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues, idxBase, alg, buffer); #endif - - conjugate_kernel<<>>( - trans->get_num_stored_elements(), - as_cuda_type(trans->get_values())); + if (grid_size.x > 0) { + conjugate_kernel<<>>( + trans->get_num_stored_elements(), + as_cuda_type(trans->get_values())); + } } else { GKO_NOT_IMPLEMENTED; } @@ -987,17 +1013,23 @@ void inv_symm_permute(std::shared_ptr exec, { auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); - inv_row_ptr_permute_kernel<<>>( - num_rows, perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); + if (count_num_blocks > 0) { + inv_row_ptr_permute_kernel<<>>( + num_rows, perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } components::prefix_sum(exec, permuted->get_row_ptrs(), num_rows + 1); auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - inv_symm_permute_kernel - <<>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), as_cuda_type(orig->get_const_values()), - permuted->get_row_ptrs(), permuted->get_col_idxs(), - as_cuda_type(permuted->get_values())); + if (copy_num_blocks > 0) { + inv_symm_permute_kernel + <<>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_cuda_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_cuda_type(permuted->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1012,18 +1044,23 @@ void row_permute(std::shared_ptr exec, { auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); - row_ptr_permute_kernel<<>>( - num_rows, perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); + if (count_num_blocks > 0) { + row_ptr_permute_kernel<<>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1); auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - row_permute_kernel - <<>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), as_cuda_type(orig->get_const_values()), - row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), - as_cuda_type(row_permuted->get_values())); + if (copy_num_blocks > 0) { + row_permute_kernel + <<>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_cuda_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_cuda_type(row_permuted->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1038,18 +1075,23 @@ void inverse_row_permute(std::shared_ptr exec, { auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); - inv_row_ptr_permute_kernel<<>>( - num_rows, perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); + if (count_num_blocks > 0) { + inv_row_ptr_permute_kernel<<>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1); auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - inv_row_permute_kernel - <<>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), as_cuda_type(orig->get_const_values()), - row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), - as_cuda_type(row_permuted->get_values())); + if (copy_num_blocks > 0) { + inv_row_permute_kernel + <<>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_cuda_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_cuda_type(row_permuted->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1066,10 +1108,11 @@ void calculate_nonzeros_per_row_in_span( auto row_ptrs = source->get_const_row_ptrs(); auto col_idxs = source->get_const_col_idxs(); auto grid_dim = ceildiv(row_span.length(), default_block_size); - - kernel::calculate_nnz_per_row_in_span<<>>( - row_span, col_span, as_cuda_type(row_ptrs), as_cuda_type(col_idxs), - as_cuda_type(row_nnz->get_data())); + if (grid_dim > 0) { + kernel::calculate_nnz_per_row_in_span<<>>( + row_span, col_span, as_cuda_type(row_ptrs), as_cuda_type(col_idxs), + as_cuda_type(row_nnz->get_data())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1091,14 +1134,17 @@ void compute_submatrix(std::shared_ptr exec, auto num_nnz = source->get_num_stored_elements(); grid_dim = ceildiv(num_nnz, default_block_size); - kernel::compute_submatrix_idxs_and_vals<<>>( - num_rows, num_cols, num_nnz, row_offset, col_offset, - as_cuda_type(source->get_const_row_ptrs()), - as_cuda_type(source->get_const_col_idxs()), - as_cuda_type(source->get_const_values()), - as_cuda_type(result->get_const_row_ptrs()), - as_cuda_type(result->get_col_idxs()), - as_cuda_type(result->get_values())); + if (grid_dim > 0) { + kernel:: + compute_submatrix_idxs_and_vals<<>>( + num_rows, num_cols, num_nnz, row_offset, col_offset, + as_cuda_type(source->get_const_row_ptrs()), + as_cuda_type(source->get_const_col_idxs()), + as_cuda_type(source->get_const_values()), + as_cuda_type(result->get_const_row_ptrs()), + as_cuda_type(result->get_col_idxs()), + as_cuda_type(result->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1171,9 +1217,11 @@ void is_sorted_by_column_index( auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); auto num_blocks = ceildiv(num_rows, block_size); - kernel::check_unsorted<<>>( - to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), - num_rows, gpu_array.get_data()); + if (num_blocks > 0) { + kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_rows, gpu_array.get_data()); + } cpu_array = gpu_array; } @@ -1196,9 +1244,12 @@ void extract_diagonal(std::shared_ptr exec, const auto orig_col_idxs = orig->get_const_col_idxs(); auto diag_values = diag->get_values(); - kernel::extract_diagonal<<>>( - diag_size, nnz, as_cuda_type(orig_values), as_cuda_type(orig_row_ptrs), - as_cuda_type(orig_col_idxs), as_cuda_type(diag_values)); + if (num_blocks > 0) { + kernel::extract_diagonal<<>>( + diag_size, nnz, as_cuda_type(orig_values), + as_cuda_type(orig_row_ptrs), as_cuda_type(orig_col_idxs), + as_cuda_type(diag_values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu index 62f8603f499..ea0bcfc2762 100644 --- a/cuda/matrix/diagonal_kernels.cu +++ b/cuda/matrix/diagonal_kernels.cu @@ -74,9 +74,11 @@ void apply_to_csr(std::shared_ptr exec, const auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - kernel::apply_to_csr<<>>( - num_rows, as_cuda_type(diag_values), as_cuda_type(csr_row_ptrs), - as_cuda_type(csr_values)); + if (grid_dim > 0) { + kernel::apply_to_csr<<>>( + num_rows, as_cuda_type(diag_values), as_cuda_type(csr_row_ptrs), + as_cuda_type(csr_values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 8537e44c831..14d7f1486bd 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -151,22 +151,26 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, {static_cast(b->get_stride())}}); if (alpha == nullptr && beta == nullptr) { - kernel::spmv - <<>>( - nrows, num_worker_per_row, acc::as_cuda_range(a_vals), - a->get_const_col_idxs(), stride, num_stored_elements_per_row, - acc::as_cuda_range(b_vals), as_cuda_type(c->get_values()), - c->get_stride()); + if (grid_size.x * grid_size.y > 0) { + kernel::spmv + <<>>( + nrows, num_worker_per_row, acc::as_cuda_range(a_vals), + a->get_const_col_idxs(), stride, + num_stored_elements_per_row, acc::as_cuda_range(b_vals), + as_cuda_type(c->get_values()), c->get_stride()); + } } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( std::array{1}, alpha->get_const_values()); - kernel::spmv - <<>>( - nrows, num_worker_per_row, acc::as_cuda_range(alpha_val), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_cuda_range(b_vals), - as_cuda_type(beta->get_const_values()), - as_cuda_type(c->get_values()), c->get_stride()); + if (grid_size.x * grid_size.y > 0) { + kernel::spmv + <<>>( + nrows, num_worker_per_row, acc::as_cuda_range(alpha_val), + acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride, + num_stored_elements_per_row, acc::as_cuda_range(b_vals), + as_cuda_type(beta->get_const_values()), + as_cuda_type(c->get_values()), c->get_stride()); + } } else { GKO_KERNEL_NOT_FOUND; } diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index 6926e5d828f..5f0c2189ae6 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -244,11 +244,13 @@ void convert_to_csr(const std::shared_ptr exec, constexpr auto warps_per_block = default_block_size / config::warp_size; const auto num_blocks = ceildiv(source->get_num_block_rows(), warps_per_block); - kernel::convert_to_csr<<>>( - source->get_const_row_ptrs(), source->get_const_col_idxs(), - as_cuda_type(source->get_const_values()), result->get_row_ptrs(), - result->get_col_idxs(), as_cuda_type(result->get_values()), - source->get_num_block_rows(), source->get_block_size()); + if (num_blocks > 0) { + kernel::convert_to_csr<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_cuda_type(source->get_const_values()), result->get_row_ptrs(), + result->get_col_idxs(), as_cuda_type(result->get_values()), + source->get_num_block_rows(), source->get_block_size()); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -268,8 +270,10 @@ void transpose_blocks_impl(syn::value_list, const size_type numblocks = ceildiv(numthreads, default_block_size); const dim3 block_size{static_cast(default_block_size), 1, 1}; const dim3 grid_dim{static_cast(numblocks), 1, 1}; - kernel::transpose_blocks - <<>>(nbnz, mat->get_values()); + if (grid_dim.x > 0) { + kernel::transpose_blocks + <<>>(nbnz, mat->get_values()); + } } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, @@ -325,8 +329,11 @@ void conj_transpose(std::shared_ptr exec, const int grid_size = ceildiv(trans->get_num_stored_elements(), default_block_size); transpose(exec, orig, trans); - csr_reuse::conjugate_kernel<<>>( - trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); + if (grid_size > 0) { + csr_reuse::conjugate_kernel<<>>( + trans->get_num_stored_elements(), + as_cuda_type(trans->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -348,9 +355,11 @@ void is_sorted_by_column_index( const auto num_brows = static_cast(to_check->get_num_block_rows()); const auto num_blocks = ceildiv(num_brows, block_size); - csr_reuse::kernel::check_unsorted<<>>( - to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), - num_brows, gpu_array.get_data()); + if (num_blocks > 0) { + csr_reuse::kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_brows, gpu_array.get_data()); + } *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); } diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index 58e2cf96e2f..2bc06a05177 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -74,11 +74,13 @@ void spmv(std::shared_ptr exec, const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); - spmv_kernel<<>>( - a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_slice_size(), a->get_const_slice_sets(), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - as_cuda_type(b->get_const_values()), as_cuda_type(c->get_values())); + if (gridSize.x * gridSize.y > 0) { + spmv_kernel<<>>( + a->get_size()[0], b->get_size()[1], b->get_stride(), + c->get_stride(), a->get_slice_size(), a->get_const_slice_sets(), + as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), + as_cuda_type(b->get_const_values()), as_cuda_type(c->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); @@ -96,13 +98,16 @@ void advanced_spmv(std::shared_ptr exec, const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); - advanced_spmv_kernel<<>>( - a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_slice_size(), a->get_const_slice_sets(), - as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - as_cuda_type(b->get_const_values()), - as_cuda_type(beta->get_const_values()), as_cuda_type(c->get_values())); + if (gridSize.x * gridSize.y > 0) { + advanced_spmv_kernel<<>>( + a->get_size()[0], b->get_size()[1], b->get_stride(), + c->get_stride(), a->get_slice_size(), a->get_const_slice_sets(), + as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), + as_cuda_type(b->get_const_values()), + as_cuda_type(beta->get_const_values()), + as_cuda_type(c->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu index fb6f5c4b50d..c0b4a7b3242 100644 --- a/cuda/multigrid/amgx_pgm_kernels.cu +++ b/cuda/multigrid/amgx_pgm_kernels.cu @@ -81,8 +81,10 @@ void match_edge(std::shared_ptr exec, { const auto num = agg.get_num_elems(); const dim3 grid(ceildiv(num, default_block_size)); - kernel::match_edge_kernel<<>>( - num, strongest_neighbor.get_const_data(), agg.get_data()); + if (grid.x > 0) { + kernel::match_edge_kernel<<>>( + num, strongest_neighbor.get_const_data(), agg.get_data()); + } } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL); @@ -94,9 +96,11 @@ void count_unagg(std::shared_ptr exec, { Array active_agg(exec, agg.get_num_elems()); const dim3 grid(ceildiv(active_agg.get_num_elems(), default_block_size)); - kernel::activate_kernel<<>>( - active_agg.get_num_elems(), agg.get_const_data(), - active_agg.get_data()); + if (grid.x > 0) { + kernel::activate_kernel<<>>( + active_agg.get_num_elems(), agg.get_const_data(), + active_agg.get_data()); + } *num_unagg = reduce_add_array(exec, active_agg.get_num_elems(), active_agg.get_const_data()); } @@ -111,11 +115,15 @@ void renumber(std::shared_ptr exec, Array& agg, const auto num = agg.get_num_elems(); Array agg_map(exec, num + 1); const dim3 grid(ceildiv(num, default_block_size)); - kernel::fill_agg_kernel<<>>( - num, agg.get_const_data(), agg_map.get_data()); + if (grid.x > 0) { + kernel::fill_agg_kernel<<>>( + num, agg.get_const_data(), agg_map.get_data()); + } components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems()); - kernel::renumber_kernel<<>>( - num, agg_map.get_const_data(), agg.get_data()); + if (grid.x > 0) { + kernel::renumber_kernel<<>>( + num, agg_map.get_const_data(), agg.get_data()); + } *num_agg = exec->copy_val_to_host(agg_map.get_const_data() + num); } @@ -131,10 +139,13 @@ void find_strongest_neighbor( { const auto num = agg.get_num_elems(); const dim3 grid(ceildiv(num, default_block_size)); - kernel::find_strongest_neighbor_kernel<<>>( - num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(), - weight_mtx->get_const_values(), diag->get_const_values(), - agg.get_data(), strongest_neighbor.get_data()); + if (grid.x > 0) { + kernel::find_strongest_neighbor_kernel<<>>( + num, weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), + diag->get_const_values(), agg.get_data(), + strongest_neighbor.get_data()); + } } GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( @@ -151,19 +162,24 @@ void assign_to_exist_agg(std::shared_ptr exec, const dim3 grid(ceildiv(num, default_block_size)); if (intermediate_agg.get_num_elems() > 0) { // determinstic kernel - kernel::assign_to_exist_agg_kernel<<>>( - num, weight_mtx->get_const_row_ptrs(), - weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), - diag->get_const_values(), agg.get_const_data(), - intermediate_agg.get_data()); + if (grid.x > 0) { + kernel::assign_to_exist_agg_kernel<<>>( + num, weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), + weight_mtx->get_const_values(), diag->get_const_values(), + agg.get_const_data(), intermediate_agg.get_data()); + } // Copy the intermediate_agg to agg agg = intermediate_agg; } else { // undeterminstic kernel - kernel::assign_to_exist_agg_kernel<<>>( - num, weight_mtx->get_const_row_ptrs(), - weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), - diag->get_const_values(), agg.get_data()); + if (grid.x > 0) { + kernel::assign_to_exist_agg_kernel<<>>( + num, weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), + weight_mtx->get_const_values(), diag->get_const_values(), + agg.get_data()); + } } } diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu index 9650d37b133..d37e65da2ba 100644 --- a/cuda/preconditioner/isai_kernels.cu +++ b/cuda/preconditioner/isai_kernels.cu @@ -82,24 +82,26 @@ void generate_tri_inverse(std::shared_ptr exec, const dim3 block(default_block_size, 1, 1); const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - if (lower) { - kernel::generate_l_inverse - <<>>(static_cast(num_rows), - input->get_const_row_ptrs(), - input->get_const_col_idxs(), - as_cuda_type(input->get_const_values()), - inverse->get_row_ptrs(), inverse->get_col_idxs(), - as_cuda_type(inverse->get_values()), - excess_rhs_ptrs, excess_nz_ptrs); - } else { - kernel::generate_u_inverse - <<>>(static_cast(num_rows), - input->get_const_row_ptrs(), - input->get_const_col_idxs(), - as_cuda_type(input->get_const_values()), - inverse->get_row_ptrs(), inverse->get_col_idxs(), - as_cuda_type(inverse->get_values()), - excess_rhs_ptrs, excess_nz_ptrs); + if (grid.x > 0) { + if (lower) { + kernel::generate_l_inverse + <<>>( + static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_cuda_type(input->get_const_values()), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + as_cuda_type(inverse->get_values()), excess_rhs_ptrs, + excess_nz_ptrs); + } else { + kernel::generate_u_inverse + <<>>( + static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_cuda_type(input->get_const_values()), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + as_cuda_type(inverse->get_values()), excess_rhs_ptrs, + excess_nz_ptrs); + } } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); @@ -120,13 +122,16 @@ void generate_general_inverse(std::shared_ptr exec, const dim3 block(default_block_size, 1, 1); const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - kernel::generate_general_inverse - <<>>( - static_cast(num_rows), input->get_const_row_ptrs(), - input->get_const_col_idxs(), - as_cuda_type(input->get_const_values()), inverse->get_row_ptrs(), - inverse->get_col_idxs(), as_cuda_type(inverse->get_values()), - excess_rhs_ptrs, excess_nz_ptrs, spd); + if (grid.x > 0) { + kernel::generate_general_inverse + <<>>(static_cast(num_rows), + input->get_const_row_ptrs(), + input->get_const_col_idxs(), + as_cuda_type(input->get_const_values()), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + as_cuda_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs, spd); + } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); } @@ -149,14 +154,17 @@ void generate_excess_system(std::shared_ptr exec, const dim3 block(default_block_size, 1, 1); const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::generate_excess_system<<>>( - static_cast(num_rows), input->get_const_row_ptrs(), - input->get_const_col_idxs(), as_cuda_type(input->get_const_values()), - inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(), - excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(), - excess_system->get_col_idxs(), - as_cuda_type(excess_system->get_values()), - as_cuda_type(excess_rhs->get_values()), e_start, e_end); + if (grid.x > 0) { + kernel::generate_excess_system<<>>( + static_cast(num_rows), input->get_const_row_ptrs(), + input->get_const_col_idxs(), + as_cuda_type(input->get_const_values()), + inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(), + excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(), + excess_system->get_col_idxs(), + as_cuda_type(excess_system->get_values()), + as_cuda_type(excess_rhs->get_values()), e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -171,9 +179,11 @@ void scale_excess_solution(std::shared_ptr, { const dim3 block(default_block_size, 1, 1); const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::scale_excess_solution<<>>( - excess_block_ptrs, as_cuda_type(excess_solution->get_values()), e_start, - e_end); + if (grid.x > 0) { + kernel::scale_excess_solution<<>>( + excess_block_ptrs, as_cuda_type(excess_solution->get_values()), + e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -191,10 +201,12 @@ void scatter_excess_solution(std::shared_ptr exec, const dim3 block(default_block_size, 1, 1); const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::copy_excess_solution<<>>( - static_cast(num_rows), inverse->get_const_row_ptrs(), - excess_rhs_ptrs, as_cuda_type(excess_solution->get_const_values()), - as_cuda_type(inverse->get_values()), e_start, e_end); + if (grid.x > 0) { + kernel::copy_excess_solution<<>>( + static_cast(num_rows), inverse->get_const_row_ptrs(), + excess_rhs_ptrs, as_cuda_type(excess_solution->get_const_values()), + as_cuda_type(inverse->get_values()), e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu index 66c3f673af4..77064e0ec1a 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu @@ -89,19 +89,22 @@ void advanced_apply( 1, 1); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - kernel::advanced_adaptive_apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_precisions, - block_pointers, num_blocks, as_cuda_type(alpha), - as_cuda_type(b), b_stride, as_cuda_type(x), x_stride); - } else { - kernel::advanced_apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_pointers, - num_blocks, as_cuda_type(alpha), as_cuda_type(b), b_stride, - as_cuda_type(x), x_stride); + if (grid_size.x > 0) { + if (block_precisions) { + kernel::advanced_adaptive_apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_precisions, + block_pointers, num_blocks, as_cuda_type(alpha), + as_cuda_type(b), b_stride, as_cuda_type(x), x_stride); + } else { + kernel::advanced_apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_pointers, + num_blocks, as_cuda_type(alpha), as_cuda_type(b), b_stride, + as_cuda_type(x), x_stride); + } } } diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu index 84f525a7200..7e8ffd17b2c 100644 --- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu @@ -92,22 +92,26 @@ void generate(syn::value_list, 1, 1); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - kernel::adaptive_generate - <<>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_cuda_type(mtx->get_const_values()), as_cuda_type(accuracy), - as_cuda_type(block_data), storage_scheme, - as_cuda_type(conditioning), block_precisions, block_ptrs, - num_blocks); - } else { - kernel::generate - <<>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_cuda_type(mtx->get_const_values()), as_cuda_type(block_data), - storage_scheme, block_ptrs, num_blocks); + if (grid_size.x > 0) { + if (block_precisions) { + kernel::adaptive_generate + <<>>( + mtx->get_size()[0], mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), + as_cuda_type(mtx->get_const_values()), + as_cuda_type(accuracy), as_cuda_type(block_data), + storage_scheme, as_cuda_type(conditioning), + block_precisions, block_ptrs, num_blocks); + } else { + kernel::generate + <<>>( + mtx->get_size()[0], mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), + as_cuda_type(mtx->get_const_values()), + as_cuda_type(block_data), storage_scheme, block_ptrs, + num_blocks); + } } } diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index 81b0607cf06..a9bb27a7353 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -82,9 +82,12 @@ size_type find_natural_blocks(std::shared_ptr exec, const dim3 block_size(config::warp_size, 1, 1); const dim3 grid_size( ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1); - compare_adjacent_rows<<>>( - mtx->get_size()[0], max_block_size, mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), matching_next_row.get_data()); + + if (grid_size.x > 0) { + compare_adjacent_rows<<>>( + mtx->get_size()[0], max_block_size, mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), matching_next_row.get_data()); + } generate_natural_block_pointer<<<1, 1, 0, 0>>>( mtx->get_size()[0], max_block_size, matching_next_row.get_const_data(), block_ptrs, nums.get_data()); @@ -119,9 +122,11 @@ void initialize_precisions(std::shared_ptr exec, const auto grid_size = min( default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); - duplicate_array<<>>( - source.get_const_data(), source.get_num_elems(), precisions.get_data(), - precisions.get_num_elems()); + if (grid_size > 0) { + duplicate_array<<>>( + source.get_const_data(), source.get_num_elems(), + precisions.get_data(), precisions.get_num_elems()); + } } @@ -160,17 +165,19 @@ void transpose_jacobi( 1, 1); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - adaptive_transpose_jacobi - <<>>( - as_cuda_type(blocks), storage_scheme, block_precisions, - block_pointers, num_blocks, as_cuda_type(out_blocks)); - } else { - transpose_jacobi<<>>( - as_cuda_type(blocks), storage_scheme, block_pointers, num_blocks, - as_cuda_type(out_blocks)); + if (grid_size.x > 0) { + if (block_precisions) { + adaptive_transpose_jacobi + <<>>( + as_cuda_type(blocks), storage_scheme, block_precisions, + block_pointers, num_blocks, as_cuda_type(out_blocks)); + } else { + transpose_jacobi<<>>( + as_cuda_type(blocks), storage_scheme, block_pointers, + num_blocks, as_cuda_type(out_blocks)); + } } } diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu index a37fcd3a65b..1d1ff5ae0c2 100644 --- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu @@ -88,18 +88,21 @@ void apply(syn::value_list, size_type num_blocks, 1, 1); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - kernel::adaptive_apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_precisions, - block_pointers, num_blocks, as_cuda_type(b), b_stride, - as_cuda_type(x), x_stride); - } else { - kernel::apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_pointers, - num_blocks, as_cuda_type(b), b_stride, as_cuda_type(x), - x_stride); + if (grid_size.x > 0) { + if (block_precisions) { + kernel::adaptive_apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_precisions, + block_pointers, num_blocks, as_cuda_type(b), b_stride, + as_cuda_type(x), x_stride); + } else { + kernel::apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_pointers, + num_blocks, as_cuda_type(b), b_stride, as_cuda_type(x), + x_stride); + } } } diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu index 1793c264775..01c09d0f7ae 100644 --- a/cuda/solver/multigrid_kernels.cu +++ b/cuda/solver/multigrid_kernels.cu @@ -81,12 +81,14 @@ void kcycle_step_1(std::shared_ptr exec, const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; const dim3 grid(ceildiv(grid_nrows * nrhs, default_block_size)); - kernel::kcycle_step_1_kernel<<>>( - nrows, nrhs, e->get_stride(), grid_nrows, - as_cuda_type(alpha->get_const_values()), - as_cuda_type(rho->get_const_values()), - as_cuda_type(v->get_const_values()), as_cuda_type(g->get_values()), - as_cuda_type(d->get_values()), as_cuda_type(e->get_values())); + if (grid.x > 0) { + kernel::kcycle_step_1_kernel<<>>( + nrows, nrhs, e->get_stride(), grid_nrows, + as_cuda_type(alpha->get_const_values()), + as_cuda_type(rho->get_const_values()), + as_cuda_type(v->get_const_values()), as_cuda_type(g->get_values()), + as_cuda_type(d->get_values()), as_cuda_type(e->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); @@ -108,14 +110,16 @@ void kcycle_step_2(std::shared_ptr exec, const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; const dim3 grid(ceildiv(grid_nrows * nrhs, default_block_size)); - kernel::kcycle_step_2_kernel<<>>( - nrows, nrhs, e->get_stride(), grid_nrows, - as_cuda_type(alpha->get_const_values()), - as_cuda_type(rho->get_const_values()), - as_cuda_type(gamma->get_const_values()), - as_cuda_type(beta->get_const_values()), - as_cuda_type(zeta->get_const_values()), - as_cuda_type(d->get_const_values()), as_cuda_type(e->get_values())); + if (grid.x > 0) { + kernel::kcycle_step_2_kernel<<>>( + nrows, nrhs, e->get_stride(), grid_nrows, + as_cuda_type(alpha->get_const_values()), + as_cuda_type(rho->get_const_values()), + as_cuda_type(gamma->get_const_values()), + as_cuda_type(beta->get_const_values()), + as_cuda_type(zeta->get_const_values()), + as_cuda_type(d->get_const_values()), as_cuda_type(e->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); @@ -132,10 +136,12 @@ void kcycle_check_stop(std::shared_ptr exec, true); const auto nrhs = new_norm->get_size()[1]; const dim3 grid(ceildiv(nrhs, default_block_size)); - kernel::kcycle_check_stop_kernel<<>>( - nrhs, as_cuda_type(old_norm->get_const_values()), - as_cuda_type(new_norm->get_const_values()), rel_tol, - as_cuda_type(dis_stop.get_data())); + if (grid.x > 0) { + kernel::kcycle_check_stop_kernel<<>>( + nrhs, as_cuda_type(old_norm->get_const_values()), + as_cuda_type(new_norm->get_const_values()), rel_tol, + as_cuda_type(dis_stop.get_data())); + } is_stop = exec->copy_val_to_host(dis_stop.get_const_data()); } diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu index 25395d9e11b..7c377985109 100644 --- a/cuda/stop/criterion_kernels.cu +++ b/cuda/stop/criterion_kernels.cu @@ -76,9 +76,11 @@ void set_all_statuses(std::shared_ptr exec, const dim3 grid_size(ceildiv(stop_status->get_num_elems(), block_size.x), 1, 1); - set_all_statuses<<>>( - stop_status->get_num_elems(), stoppingId, setFinalized, - as_cuda_type(stop_status->get_data())); + if (grid_size.x > 0) { + set_all_statuses<<>>( + stop_status->get_num_elems(), stoppingId, setFinalized, + as_cuda_type(stop_status->get_data())); + } } diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu index d4ba96cbbdc..128593dd1b5 100644 --- a/cuda/stop/residual_norm_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -104,12 +104,14 @@ void residual_norm(std::shared_ptr exec, const dim3 block_size(default_block_size, 1, 1); const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); - residual_norm_kernel<<>>( - tau->get_size()[1], rel_residual_goal, - as_cuda_type(tau->get_const_values()), - as_cuda_type(orig_tau->get_const_values()), stoppingId, setFinalized, - as_cuda_type(stop_status->get_data()), - as_cuda_type(device_storage->get_data())); + if (grid_size.x > 0) { + residual_norm_kernel<<>>( + tau->get_size()[1], rel_residual_goal, + as_cuda_type(tau->get_const_values()), + as_cuda_type(orig_tau->get_const_values()), stoppingId, + setFinalized, as_cuda_type(stop_status->get_data()), + as_cuda_type(device_storage->get_data())); + } /* Represents all_converged, one_changed */ *all_converged = exec->copy_val_to_host(device_storage->get_const_data()); @@ -181,12 +183,14 @@ void implicit_residual_norm( const dim3 block_size(default_block_size, 1, 1); const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); - implicit_residual_norm_kernel<<>>( - tau->get_size()[1], rel_residual_goal, - as_cuda_type(tau->get_const_values()), - as_cuda_type(orig_tau->get_const_values()), stoppingId, setFinalized, - as_cuda_type(stop_status->get_data()), - as_cuda_type(device_storage->get_data())); + if (grid_size.x > 0) { + implicit_residual_norm_kernel<<>>( + tau->get_size()[1], rel_residual_goal, + as_cuda_type(tau->get_const_values()), + as_cuda_type(orig_tau->get_const_values()), stoppingId, + setFinalized, as_cuda_type(stop_status->get_data()), + as_cuda_type(device_storage->get_data())); + } /* Represents all_converged, one_changed */ *all_converged = exec->copy_val_to_host(device_storage->get_const_data()); From 17891e580b819b99b2f21f61c0bef63fa8fc08f3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 14:08:44 +0100 Subject: [PATCH 23/32] improve sorting test output --- cuda/test/components/sorting.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda/test/components/sorting.cu b/cuda/test/components/sorting.cu index eaac82a7a99..1392073f1fe 100644 --- a/cuda/test/components/sorting.cu +++ b/cuda/test/components/sorting.cu @@ -125,8 +125,7 @@ TEST_F(Sorting, CudaBitonicSortWarp) auto data_ptr = ddata.get_const_data(); auto ref_ptr = ref_warp.get_const_data(); - ASSERT_TRUE(std::equal(data_ptr, data_ptr + (num_local * config::warp_size), - ref_ptr)); + GKO_ASSERT_ARRAY_EQ(ddata, ref_warp); } @@ -137,7 +136,7 @@ TEST_F(Sorting, CudaBitonicSortShared) auto data_ptr = ddata.get_const_data(); auto ref_ptr = ref_shared.get_const_data(); - ASSERT_TRUE(std::equal(data_ptr, data_ptr + num_elements, ref_ptr)); + GKO_ASSERT_ARRAY_EQ(ddata, ref_shared); } From 66b5ed223221ede5181fd44d493a4a9757eebfeb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 9 Dec 2021 17:55:57 +0100 Subject: [PATCH 24/32] remove unnecessary dim3 usages --- cuda/factorization/factorization_kernels.cu | 37 +++---- cuda/factorization/par_ilu_kernels.cu | 10 +- cuda/matrix/csr_kernels.cu | 14 +-- cuda/matrix/fbcsr_kernels.cu | 6 +- cuda/matrix/sellp_kernels.cu | 4 +- cuda/multigrid/amgx_pgm_kernels.cu | 24 ++-- cuda/preconditioner/isai_kernels.cu | 30 ++--- .../jacobi_advanced_apply_instantiate.inc.cu | 6 +- .../jacobi_generate_instantiate.inc.cu | 6 +- cuda/preconditioner/jacobi_kernels.cu | 14 +-- .../jacobi_simple_apply_instantiate.inc.cu | 6 +- cuda/solver/cb_gmres_kernels.cu | 40 ++++--- cuda/solver/gmres_kernels.cu | 34 +++--- cuda/solver/multigrid_kernels.cu | 12 +- cuda/stop/criterion_kernels.cu | 7 +- cuda/stop/residual_norm_kernels.cu | 12 +- hip/components/prefix_sum_kernels.hip.cpp | 8 +- hip/components/reduction.hip.hpp | 12 +- .../factorization_kernels.hip.cpp | 53 +++++---- hip/factorization/par_ict_kernels.hip.cpp | 18 +-- hip/factorization/par_ilu_kernels.hip.cpp | 10 +- .../par_ilut_approx_filter_kernel.hip.cpp | 12 +- .../par_ilut_filter_kernel.hip.cpp | 14 +-- .../par_ilut_select_common.hip.cpp | 21 ++-- .../par_ilut_select_kernel.hip.cpp | 13 +-- .../par_ilut_spgeam_kernel.hip.cpp | 14 +-- .../par_ilut_sweep_kernel.hip.cpp | 4 +- hip/matrix/coo_kernels.hip.cpp | 22 ++-- hip/matrix/csr_kernels.hip.cpp | 95 ++++++++-------- hip/matrix/dense_kernels.hip.cpp | 29 +++-- hip/matrix/ell_kernels.hip.cpp | 4 +- hip/matrix/sellp_kernels.hip.cpp | 13 +-- hip/multigrid/amgx_pgm_kernels.hip.cpp | 61 +++++------ hip/preconditioner/isai_kernels.hip.cpp | 20 ++-- ...obi_advanced_apply_instantiate.inc.hip.cpp | 17 ++- .../jacobi_generate_instantiate.inc.hip.cpp | 8 +- hip/preconditioner/jacobi_kernels.hip.cpp | 35 +++--- ...acobi_simple_apply_instantiate.inc.hip.cpp | 16 +-- hip/solver/cb_gmres_kernels.hip.cpp | 80 +++++++------- hip/solver/gmres_kernels.hip.cpp | 103 +++++++++--------- hip/solver/multigrid_kernels.hip.cpp | 32 +++--- hip/stop/criterion_kernels.hip.cpp | 11 +- hip/stop/residual_norm_kernels.hip.cpp | 24 ++-- hip/test/base/hip_executor.hip.cpp | 8 +- hip/test/base/math.hip.cpp | 8 +- .../components/cooperative_groups.hip.cpp | 15 ++- hip/test/components/merging.hip.cpp | 32 +++--- hip/test/components/searching.hip.cpp | 6 +- hip/test/components/sorting.hip.cpp | 8 +- 49 files changed, 519 insertions(+), 569 deletions(-) diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu index 02aa605d9bb..202da06749f 100644 --- a/cuda/factorization/factorization_kernels.cu +++ b/cuda/factorization/factorization_kernels.cu @@ -87,10 +87,9 @@ void add_diagonal_elements(std::shared_ptr exec, auto cuda_old_row_ptrs = as_cuda_type(mtx->get_row_ptrs()); auto cuda_row_ptrs_add = as_cuda_type(row_ptrs_addition.get_data()); - const dim3 block_dim{default_block_size, 1, 1}; - const dim3 grid_dim{ - static_cast(ceildiv(num_rows, block_dim.x / subwarp_size)), 1, - 1}; + const auto block_dim = default_block_size; + const auto grid_dim = + static_cast(ceildiv(num_rows, block_dim / subwarp_size)); if (num_rows > 0) { if (is_sorted) { kernel::find_missing_diagonal_elements @@ -131,8 +130,8 @@ void add_diagonal_elements(std::shared_ptr exec, cuda_old_row_ptrs, cuda_new_values, cuda_new_col_idxs, cuda_row_ptrs_add); - const dim3 grid_dim_row_ptrs_update{ - static_cast(ceildiv(num_rows, block_dim.x)), 1, 1}; + const auto grid_dim_row_ptrs_update = + static_cast(ceildiv(num_rows, block_dim)); kernel::update_row_ptrs<<>>( num_rows + 1, cuda_old_row_ptrs, cuda_row_ptrs_add); @@ -153,10 +152,10 @@ void initialize_row_ptrs_l_u( { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; + const auto block_size = default_block_size; const uint32 number_blocks = - ceildiv(num_rows, static_cast(block_size.x)); - const dim3 grid_dim{number_blocks, 1, 1}; + ceildiv(num_rows, static_cast(block_size)); + const auto grid_dim = number_blocks; if (num_rows > 0) { kernel::count_nnz_per_l_u_row<<>>( @@ -181,10 +180,9 @@ void initialize_l_u(std::shared_ptr exec, matrix::Csr* csr_u) { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv( - num_rows, static_cast(block_size.x))), - 1, 1}; + const auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows, static_cast(block_size))); if (num_rows > 0) { kernel::initialize_l_u<<>>( @@ -212,10 +210,10 @@ void initialize_row_ptrs_l( { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; + const auto block_size = default_block_size; const uint32 number_blocks = - ceildiv(num_rows, static_cast(block_size.x)); - const dim3 grid_dim{number_blocks, 1, 1}; + ceildiv(num_rows, static_cast(block_size)); + const auto grid_dim = number_blocks; if (num_rows > 0) { kernel::count_nnz_per_l_row<<>>( @@ -238,10 +236,9 @@ void initialize_l(std::shared_ptr exec, matrix::Csr* csr_l, bool diag_sqrt) { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv( - num_rows, static_cast(block_size.x))), - 1, 1}; + const auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows, static_cast(block_size))); if (num_rows > 0) { kernel::initialize_l<<>>( diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index 5f5f39d3ce9..09f7a50a901 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -67,13 +67,11 @@ void compute_l_u_factors(std::shared_ptr exec, { iterations = (iterations == 0) ? 10 : iterations; const auto num_elements = system_matrix->get_num_stored_elements(); - const dim3 block_size{default_block_size, 1, 1}; - const dim3 grid_dim{ - static_cast( - ceildiv(num_elements, static_cast(block_size.x))), - 1, 1}; + const auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_elements, static_cast(block_size))); for (size_type i = 0; i < iterations; ++i) { - if (grid_dim.x > 0) { + if (grid_dim > 0) { kernel::compute_l_u_factors<<>>( num_elements, system_matrix->get_const_row_idxs(), system_matrix->get_const_col_idxs(), diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 00f38a78b7b..e04cac7f51e 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -116,8 +116,8 @@ void merge_path_spmv(syn::value_list, const IndexType total = a->get_size()[0] + a->get_num_stored_elements(); const IndexType grid_num = ceildiv(total, spmv_block_size * items_per_thread); - const dim3 grid(grid_num); - const dim3 block(spmv_block_size); + const auto grid = grid_num; + const auto block = spmv_block_size; Array row_out(exec, grid_num); Array val_out(exec, grid_num); @@ -227,7 +227,7 @@ void classical_spmv(syn::value_list, std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), int64(nwarps / warps_in_block)); const dim3 grid(gridx, b->get_size()[1]); - const dim3 block(spmv_block_size); + const auto block = spmv_block_size; if (alpha == nullptr && beta == nullptr) { if (grid.x * grid.y > 0) { @@ -953,9 +953,9 @@ void conj_transpose(std::shared_ptr exec, matrix::Csr* trans) { if (cusparse::is_supported::value) { - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size( - ceildiv(trans->get_num_stored_elements(), block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = + ceildiv(trans->get_num_stored_elements(), block_size); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; @@ -991,7 +991,7 @@ void conj_transpose(std::shared_ptr exec, trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues, idxBase, alg, buffer); #endif - if (grid_size.x > 0) { + if (grid_size > 0) { conjugate_kernel<<>>( trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index 5f0c2189ae6..a197d7833da 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -268,9 +268,9 @@ void transpose_blocks_impl(syn::value_list, const size_type nbnz = mat->get_num_stored_blocks(); const size_type numthreads = nbnz * subwarp_size; const size_type numblocks = ceildiv(numthreads, default_block_size); - const dim3 block_size{static_cast(default_block_size), 1, 1}; - const dim3 grid_dim{static_cast(numblocks), 1, 1}; - if (grid_dim.x > 0) { + const auto block_size = static_cast(default_block_size); + const auto grid_dim = static_cast(numblocks); + if (grid_dim > 0) { kernel::transpose_blocks <<>>(nbnz, mat->get_values()); } diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index 2bc06a05177..fcf17e9ef1d 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -70,7 +70,7 @@ void spmv(std::shared_ptr exec, const matrix::Sellp* a, const matrix::Dense* b, matrix::Dense* c) { - const dim3 blockSize(default_block_size); + const auto blockSize = default_block_size; const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); @@ -94,7 +94,7 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - const dim3 blockSize(default_block_size); + const auto blockSize = default_block_size; const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu index c0b4a7b3242..a781fb19aaf 100644 --- a/cuda/multigrid/amgx_pgm_kernels.cu +++ b/cuda/multigrid/amgx_pgm_kernels.cu @@ -80,8 +80,8 @@ void match_edge(std::shared_ptr exec, Array& agg) { const auto num = agg.get_num_elems(); - const dim3 grid(ceildiv(num, default_block_size)); - if (grid.x > 0) { + const auto grid = ceildiv(num, default_block_size); + if (grid > 0) { kernel::match_edge_kernel<<>>( num, strongest_neighbor.get_const_data(), agg.get_data()); } @@ -95,8 +95,8 @@ void count_unagg(std::shared_ptr exec, const Array& agg, IndexType* num_unagg) { Array active_agg(exec, agg.get_num_elems()); - const dim3 grid(ceildiv(active_agg.get_num_elems(), default_block_size)); - if (grid.x > 0) { + const auto grid = ceildiv(active_agg.get_num_elems(), default_block_size); + if (grid > 0) { kernel::activate_kernel<<>>( active_agg.get_num_elems(), agg.get_const_data(), active_agg.get_data()); @@ -114,13 +114,13 @@ void renumber(std::shared_ptr exec, Array& agg, { const auto num = agg.get_num_elems(); Array agg_map(exec, num + 1); - const dim3 grid(ceildiv(num, default_block_size)); - if (grid.x > 0) { + const auto grid = ceildiv(num, default_block_size); + if (grid > 0) { kernel::fill_agg_kernel<<>>( num, agg.get_const_data(), agg_map.get_data()); } components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems()); - if (grid.x > 0) { + if (grid > 0) { kernel::renumber_kernel<<>>( num, agg_map.get_const_data(), agg.get_data()); } @@ -138,8 +138,8 @@ void find_strongest_neighbor( Array& strongest_neighbor) { const auto num = agg.get_num_elems(); - const dim3 grid(ceildiv(num, default_block_size)); - if (grid.x > 0) { + const auto grid = ceildiv(num, default_block_size); + if (grid > 0) { kernel::find_strongest_neighbor_kernel<<>>( num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), @@ -159,10 +159,10 @@ void assign_to_exist_agg(std::shared_ptr exec, Array& intermediate_agg) { const auto num = agg.get_num_elems(); - const dim3 grid(ceildiv(num, default_block_size)); + const auto grid = ceildiv(num, default_block_size); if (intermediate_agg.get_num_elems() > 0) { // determinstic kernel - if (grid.x > 0) { + if (grid > 0) { kernel::assign_to_exist_agg_kernel<<>>( num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(), @@ -173,7 +173,7 @@ void assign_to_exist_agg(std::shared_ptr exec, agg = intermediate_agg; } else { // undeterminstic kernel - if (grid.x > 0) { + if (grid > 0) { kernel::assign_to_exist_agg_kernel<<>>( num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(), diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu index d37e65da2ba..3cc0bebf123 100644 --- a/cuda/preconditioner/isai_kernels.cu +++ b/cuda/preconditioner/isai_kernels.cu @@ -80,9 +80,9 @@ void generate_tri_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - if (grid.x > 0) { + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); + if (grid > 0) { if (lower) { kernel::generate_l_inverse <<>>( @@ -120,9 +120,9 @@ void generate_general_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - if (grid.x > 0) { + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); + if (grid > 0) { kernel::generate_general_inverse <<>>(static_cast(num_rows), input->get_const_row_ptrs(), @@ -152,9 +152,9 @@ void generate_excess_system(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - if (grid.x > 0) { + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { kernel::generate_excess_system<<>>( static_cast(num_rows), input->get_const_row_ptrs(), input->get_const_col_idxs(), @@ -177,9 +177,9 @@ void scale_excess_solution(std::shared_ptr, matrix::Dense* excess_solution, size_type e_start, size_type e_end) { - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - if (grid.x > 0) { + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { kernel::scale_excess_solution<<>>( excess_block_ptrs, as_cuda_type(excess_solution->get_values()), e_start, e_end); @@ -199,9 +199,9 @@ void scatter_excess_solution(std::shared_ptr exec, { const auto num_rows = inverse->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - if (grid.x > 0) { + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { kernel::copy_excess_solution<<>>( static_cast(num_rows), inverse->get_const_row_ptrs(), excess_rhs_ptrs, as_cuda_type(excess_solution->get_const_values()), diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu index 77064e0ec1a..3917c54bf43 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu @@ -85,11 +85,11 @@ void advanced_apply( { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (grid_size.x > 0) { + if (grid_size > 0) { if (block_precisions) { kernel::advanced_adaptive_apply diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu index 7e8ffd17b2c..240b4f4b5fd 100644 --- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu @@ -88,11 +88,11 @@ void generate(syn::value_list, { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (grid_size.x > 0) { + if (grid_size > 0) { if (block_precisions) { kernel::adaptive_generate diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index a9bb27a7353..205f4208c82 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -79,11 +79,11 @@ size_type find_natural_blocks(std::shared_ptr exec, Array matching_next_row(exec, mtx->get_size()[0] - 1); - const dim3 block_size(config::warp_size, 1, 1); - const dim3 grid_size( - ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1); + const auto block_size = config::warp_size; + const auto grid_size = + ceildiv(mtx->get_size()[0] * config::warp_size, block_size); - if (grid_size.x > 0) { + if (grid_size > 0) { compare_adjacent_rows<<>>( mtx->get_size()[0], max_block_size, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), matching_next_row.get_data()); @@ -161,11 +161,11 @@ void transpose_jacobi( { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (grid_size.x > 0) { + if (grid_size > 0) { if (block_precisions) { adaptive_transpose_jacobi diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu index 1d1ff5ae0c2..2d21f9d43ac 100644 --- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu @@ -84,11 +84,11 @@ void apply(syn::value_list, size_type num_blocks, { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (grid_size.x > 0) { + if (grid_size > 0) { if (block_precisions) { kernel::adaptive_apply diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 91e4f7db562..f494fa760b8 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -81,8 +81,8 @@ constexpr int default_dot_size = default_dot_dim * default_dot_dim; template void zero_matrix(size_type m, size_type n, size_type stride, ValueType* array) { - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(n, block_size); zero_matrix_kernel<<>>(m, n, stride, as_cuda_type(array)); } @@ -98,8 +98,8 @@ void initialize_1(std::shared_ptr exec, { const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), krylov_dim * b->get_size()[1]); - const dim3 grid_dim(ceildiv(num_threads, default_block_size), 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + const auto grid_dim = ceildiv(num_threads, default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; initialize_1_kernel<<>>( @@ -131,9 +131,9 @@ void initialize_2(std::shared_ptr exec, const auto krylov_stride = gko::cb_gmres::helper_functions_accessor::get_stride( krylov_bases); - const dim3 grid_dim_1( - ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size), 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + const auto grid_dim_1 = + ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); @@ -169,8 +169,8 @@ void initialize_2(std::shared_ptr exec, stride_arnoldi, acc::as_cuda_range(krylov_bases)); } - const dim3 grid_dim_2( - ceildiv(num_rows * krylov_stride[1], default_block_size), 1, 1); + const auto grid_dim_2 = + ceildiv(num_rows * krylov_stride[1], default_block_size); initialize_2_2_kernel<<>>( residual->get_size()[0], residual->get_size()[1], as_cuda_type(residual->get_const_values()), residual->get_stride(), @@ -218,7 +218,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, // further investigation. const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, iter + 1); - const dim3 block_size_iters_single(singledot_block_size); + const auto block_size_iters_single = singledot_block_size; size_type num_reorth_host; components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], @@ -372,9 +372,9 @@ void givens_rotation(std::shared_ptr exec, // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_cols = hessenberg_iter->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{ - static_cast(ceildiv(num_cols, block_size)), 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_cols, block_size)); givens_rotation_kernel<<>>( hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], iter, @@ -428,9 +428,9 @@ void solve_upper_triangular( // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_rhs = residual_norm_collection->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv(num_rhs, block_size)), - 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_rhs, block_size)); solve_upper_triangular_kernel<<>>( hessenberg->get_size()[1], num_rhs, @@ -454,11 +454,9 @@ void calculate_qy(ConstAccessor3d krylov_bases, size_type num_krylov_bases, before_preconditioner->get_stride(); constexpr auto block_size = default_block_size; - const dim3 grid_dim{ - static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)), - 1, 1}; - const dim3 block_dim{block_size, 1, 1}; + const auto grid_dim = static_cast( + ceildiv(num_rows * stride_before_preconditioner, block_size)); + const auto block_dim = block_size; calculate_Qy_kernel<<>>( diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index c0eb2e20110..5e946868fdf 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -86,8 +86,8 @@ void initialize_1(std::shared_ptr exec, { const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), krylov_dim * b->get_size()[1]); - const dim3 grid_dim(ceildiv(num_threads, default_block_size), 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + const auto grid_dim = ceildiv(num_threads, default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; initialize_1_kernel<<>>( @@ -112,17 +112,15 @@ void initialize_2(std::shared_ptr exec, { const auto num_rows = residual->get_size()[0]; const auto num_rhs = residual->get_size()[1]; - const dim3 grid_dim_1( + const auto grid_dim_1 = ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(), - default_block_size), - 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; kernels::cuda::dense::compute_norm2(exec, residual, residual_norm); - const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, - 1); + const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); initialize_2_2_kernel<<>>( residual->get_size()[0], residual->get_size()[1], as_cuda_type(residual->get_const_values()), residual->get_stride(), @@ -218,9 +216,9 @@ void givens_rotation(std::shared_ptr exec, // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_cols = hessenberg_iter->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{ - static_cast(ceildiv(num_cols, block_size)), 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_cols, block_size)); givens_rotation_kernel<<>>( hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], iter, @@ -269,9 +267,9 @@ void solve_upper_triangular( // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_rhs = residual_norm_collection->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv(num_rhs, block_size)), - 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_rhs, block_size)); solve_upper_triangular_kernel<<>>( hessenberg->get_size()[1], num_rhs, @@ -296,11 +294,9 @@ void calculate_qy(const matrix::Dense* krylov_bases, before_preconditioner->get_stride(); constexpr auto block_size = default_block_size; - const dim3 grid_dim{ - static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)), - 1, 1}; - const dim3 block_dim{block_size, 1, 1}; + const auto grid_dim = static_cast( + ceildiv(num_rows * stride_before_preconditioner, block_size)); + const auto block_dim = block_size; calculate_Qy_kernel<<>>( diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu index 01c09d0f7ae..27f1b5f37e9 100644 --- a/cuda/solver/multigrid_kernels.cu +++ b/cuda/solver/multigrid_kernels.cu @@ -80,8 +80,8 @@ void kcycle_step_1(std::shared_ptr exec, constexpr int max_size = (1U << 31) - 1; const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; - const dim3 grid(ceildiv(grid_nrows * nrhs, default_block_size)); - if (grid.x > 0) { + const auto grid = ceildiv(grid_nrows * nrhs, default_block_size); + if (grid > 0) { kernel::kcycle_step_1_kernel<<>>( nrows, nrhs, e->get_stride(), grid_nrows, as_cuda_type(alpha->get_const_values()), @@ -109,8 +109,8 @@ void kcycle_step_2(std::shared_ptr exec, constexpr int max_size = (1U << 31) - 1; const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; - const dim3 grid(ceildiv(grid_nrows * nrhs, default_block_size)); - if (grid.x > 0) { + const auto grid = ceildiv(grid_nrows * nrhs, default_block_size); + if (grid > 0) { kernel::kcycle_step_2_kernel<<>>( nrows, nrhs, e->get_stride(), grid_nrows, as_cuda_type(alpha->get_const_values()), @@ -135,8 +135,8 @@ void kcycle_check_stop(std::shared_ptr exec, components::fill_array(exec, dis_stop.get_data(), dis_stop.get_num_elems(), true); const auto nrhs = new_norm->get_size()[1]; - const dim3 grid(ceildiv(nrhs, default_block_size)); - if (grid.x > 0) { + const auto grid = ceildiv(nrhs, default_block_size); + if (grid > 0) { kernel::kcycle_check_stop_kernel<<>>( nrhs, as_cuda_type(old_norm->get_const_values()), as_cuda_type(new_norm->get_const_values()), rel_tol, diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu index 7c377985109..aecc7871fd4 100644 --- a/cuda/stop/criterion_kernels.cu +++ b/cuda/stop/criterion_kernels.cu @@ -72,11 +72,10 @@ void set_all_statuses(std::shared_ptr exec, uint8 stoppingId, bool setFinalized, Array* stop_status) { - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(stop_status->get_num_elems(), block_size.x), 1, - 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(stop_status->get_num_elems(), block_size); - if (grid_size.x > 0) { + if (grid_size > 0) { set_all_statuses<<>>( stop_status->get_num_elems(), stoppingId, setFinalized, as_cuda_type(stop_status->get_data())); diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu index 128593dd1b5..75d62cdedc9 100644 --- a/cuda/stop/residual_norm_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -101,10 +101,10 @@ void residual_norm(std::shared_ptr exec, "ValueType must not be complex in this function!"); init_kernel<<<1, 1>>>(as_cuda_type(device_storage->get_data())); - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(tau->get_size()[1], block_size); - if (grid_size.x > 0) { + if (grid_size > 0) { residual_norm_kernel<<>>( tau->get_size()[1], rel_residual_goal, as_cuda_type(tau->get_const_values()), @@ -180,10 +180,10 @@ void implicit_residual_norm( { init_kernel<<<1, 1>>>(as_cuda_type(device_storage->get_data())); - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(tau->get_size()[1], block_size); - if (grid_size.x > 0) { + if (grid_size > 0) { implicit_residual_norm_kernel<<>>( tau->get_size()[1], rel_residual_goal, as_cuda_type(tau->get_const_values()), diff --git a/hip/components/prefix_sum_kernels.hip.cpp b/hip/components/prefix_sum_kernels.hip.cpp index 1e472a3aa5d..a094d1b479a 100644 --- a/hip/components/prefix_sum_kernels.hip.cpp +++ b/hip/components/prefix_sum_kernels.hip.cpp @@ -56,15 +56,15 @@ void prefix_sum(std::shared_ptr exec, IndexType* counts, auto block_sums = block_sum_array.get_data(); hipLaunchKernelGGL( HIP_KERNEL_NAME(start_prefix_sum), - dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, - counts, block_sums); + num_blocks, prefix_sum_block_size, 0, 0, num_entries, counts, + block_sums); // add the total sum of the previous block only when the number of // blocks is larger than 1. if (num_blocks > 1) { hipLaunchKernelGGL( HIP_KERNEL_NAME(finalize_prefix_sum), - dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, - num_entries, counts, block_sums); + num_blocks, prefix_sum_block_size, 0, 0, num_entries, counts, + block_sums); } } } diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index 15d690329be..27910cd4432 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -84,19 +84,17 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, block_results.resize_and_reset(grid_dim); - hipLaunchKernelGGL(reduce_add_array, dim3(grid_dim), - dim3(default_reduce_block_size), 0, 0, size, - as_hip_type(source), - as_hip_type(block_results.get_data())); + hipLaunchKernelGGL( + reduce_add_array, grid_dim, default_reduce_block_size, 0, 0, size, + as_hip_type(source), as_hip_type(block_results.get_data())); block_results_val = block_results.get_const_data(); } auto d_result = Array(exec, 1); - hipLaunchKernelGGL(reduce_add_array, dim3(1), - dim3(default_reduce_block_size), 0, 0, grid_dim, - as_hip_type(block_results_val), + hipLaunchKernelGGL(reduce_add_array, 1, default_reduce_block_size, 0, 0, + grid_dim, as_hip_type(block_results_val), as_hip_type(d_result.get_data())); auto answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp index 03b8deec196..d0d36f3e955 100644 --- a/hip/factorization/factorization_kernels.hip.cpp +++ b/hip/factorization/factorization_kernels.hip.cpp @@ -89,10 +89,9 @@ void add_diagonal_elements(std::shared_ptr exec, auto hip_old_row_ptrs = as_hip_type(mtx->get_row_ptrs()); auto hip_row_ptrs_add = as_hip_type(row_ptrs_addition.get_data()); - const dim3 block_dim{default_block_size, 1, 1}; - const dim3 grid_dim{ - static_cast(ceildiv(num_rows, block_dim.x / subwarp_size)), 1, - 1}; + const auto block_dim = default_block_size; + const auto grid_dim = + static_cast(ceildiv(num_rows, block_dim / subwarp_size)); if (is_sorted) { hipLaunchKernelGGL( HIP_KERNEL_NAME( @@ -132,8 +131,8 @@ void add_diagonal_elements(std::shared_ptr exec, grid_dim, block_dim, 0, 0, num_rows, hip_old_values, hip_old_col_idxs, hip_old_row_ptrs, hip_new_values, hip_new_col_idxs, hip_row_ptrs_add); - const dim3 grid_dim_row_ptrs_update{ - static_cast(ceildiv(num_rows, block_dim.x)), 1, 1}; + const auto grid_dim_row_ptrs_update = + static_cast(ceildiv(num_rows, block_dim)); hipLaunchKernelGGL(kernel::update_row_ptrs, grid_dim_row_ptrs_update, block_dim, 0, 0, num_rows + 1, hip_old_row_ptrs, hip_row_ptrs_add); @@ -155,13 +154,13 @@ void initialize_row_ptrs_l_u( { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; + const auto block_size = default_block_size; const uint32 number_blocks = - ceildiv(num_rows, static_cast(block_size.x)); - const dim3 grid_dim{number_blocks, 1, 1}; + ceildiv(num_rows, static_cast(block_size)); + const auto grid_dim = number_blocks; - hipLaunchKernelGGL(kernel::count_nnz_per_l_u_row, dim3(grid_dim), - dim3(block_size), 0, 0, num_rows, + hipLaunchKernelGGL(kernel::count_nnz_per_l_u_row, grid_dim, block_size, 0, + 0, num_rows, as_hip_type(system_matrix->get_const_row_ptrs()), as_hip_type(system_matrix->get_const_col_idxs()), as_hip_type(system_matrix->get_const_values()), @@ -182,14 +181,13 @@ void initialize_l_u(std::shared_ptr exec, matrix::Csr* csr_u) { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv( - num_rows, static_cast(block_size.x))), - 1, 1}; + const auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows, static_cast(block_size))); hipLaunchKernelGGL( - kernel::initialize_l_u, dim3(grid_dim), dim3(block_size), 0, 0, - num_rows, as_hip_type(system_matrix->get_const_row_ptrs()), + kernel::initialize_l_u, grid_dim, block_size, 0, 0, num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), as_hip_type(system_matrix->get_const_col_idxs()), as_hip_type(system_matrix->get_const_values()), as_hip_type(csr_l->get_const_row_ptrs()), @@ -210,13 +208,13 @@ void initialize_row_ptrs_l( { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; + const auto block_size = default_block_size; const uint32 number_blocks = - ceildiv(num_rows, static_cast(block_size.x)); - const dim3 grid_dim{number_blocks, 1, 1}; + ceildiv(num_rows, static_cast(block_size)); + const auto grid_dim = number_blocks; - hipLaunchKernelGGL(kernel::count_nnz_per_l_row, dim3(grid_dim), - dim3(block_size), 0, 0, num_rows, + hipLaunchKernelGGL(kernel::count_nnz_per_l_row, grid_dim, block_size, 0, 0, + num_rows, as_hip_type(system_matrix->get_const_row_ptrs()), as_hip_type(system_matrix->get_const_col_idxs()), as_hip_type(system_matrix->get_const_values()), @@ -235,13 +233,12 @@ void initialize_l(std::shared_ptr exec, matrix::Csr* csr_l, bool diag_sqrt) { const size_type num_rows{system_matrix->get_size()[0]}; - const dim3 block_size{default_block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv( - num_rows, static_cast(block_size.x))), - 1, 1}; + const auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows, static_cast(block_size))); - hipLaunchKernelGGL(kernel::initialize_l, dim3(grid_dim), dim3(block_size), - 0, 0, num_rows, + hipLaunchKernelGGL(kernel::initialize_l, grid_dim, block_size, 0, 0, + num_rows, as_hip_type(system_matrix->get_const_row_ptrs()), as_hip_type(system_matrix->get_const_col_idxs()), as_hip_type(system_matrix->get_const_values()), diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 26f8c15193d..edd04ab93f6 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -107,9 +107,9 @@ void add_candidates(syn::value_list, auto l_new_row_ptrs = l_new->get_row_ptrs(); // count non-zeros per row hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::ict_tri_spgeam_nnz), - dim3(num_blocks), dim3(default_block_size), 0, 0, llh_row_ptrs, - llh_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs, num_rows); + HIP_KERNEL_NAME(kernel::ict_tri_spgeam_nnz), num_blocks, + default_block_size, 0, 0, llh_row_ptrs, llh_col_idxs, a_row_ptrs, + a_col_idxs, l_new_row_ptrs, num_rows); // build row ptrs components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); @@ -124,11 +124,11 @@ void add_candidates(syn::value_list, // fill columns and values hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::ict_tri_spgeam_init), - dim3(num_blocks), dim3(default_block_size), 0, 0, llh_row_ptrs, - llh_col_idxs, as_hip_type(llh_vals), a_row_ptrs, a_col_idxs, - as_hip_type(a_vals), l_row_ptrs, l_col_idxs, as_hip_type(l_vals), - l_new_row_ptrs, l_new_col_idxs, as_hip_type(l_new_vals), num_rows); + HIP_KERNEL_NAME(kernel::ict_tri_spgeam_init), num_blocks, + default_block_size, 0, 0, llh_row_ptrs, llh_col_idxs, + as_hip_type(llh_vals), a_row_ptrs, a_col_idxs, as_hip_type(a_vals), + l_row_ptrs, l_col_idxs, as_hip_type(l_vals), l_new_row_ptrs, + l_new_col_idxs, as_hip_type(l_new_vals), num_rows); } @@ -146,7 +146,7 @@ void compute_factor(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::ict_sweep), - dim3(num_blocks), dim3(default_block_size), 0, 0, + num_blocks, default_block_size, 0, 0, a->get_const_row_ptrs(), a->get_const_col_idxs(), as_hip_type(a->get_const_values()), l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp index dc4317be212..c29a554078c 100644 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -71,14 +71,12 @@ void compute_l_u_factors(std::shared_ptr exec, { iterations = (iterations == 0) ? 10 : iterations; const auto num_elements = system_matrix->get_num_stored_elements(); - const dim3 block_size{default_block_size, 1, 1}; - const dim3 grid_dim{ - static_cast( - ceildiv(num_elements, static_cast(block_size.x))), - 1, 1}; + const auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_elements, static_cast(block_size))); for (size_type i = 0; i < iterations; ++i) { hipLaunchKernelGGL( - kernel::compute_l_u_factors, dim3(grid_dim), dim3(block_size), 0, 0, + kernel::compute_l_u_factors, grid_dim, block_size, 0, 0, num_elements, system_matrix->get_const_row_idxs(), system_matrix->get_const_col_idxs(), as_hip_type(system_matrix->get_const_values()), diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index d16d224516a..ce6fba1b7f3 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -143,8 +143,8 @@ void threshold_filter_approx(syn::value_list, auto num_blocks = ceildiv(num_rows, block_size); auto new_row_ptrs = m_out->get_row_ptrs(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter_nnz), - dim3(num_blocks), dim3(default_block_size), 0, 0, - old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs); + num_blocks, default_block_size, 0, 0, old_row_ptrs, + oracles, num_rows, bucket, new_row_ptrs); // build row pointers components::prefix_sum(exec, new_row_ptrs, num_rows + 1); @@ -168,10 +168,10 @@ void threshold_filter_approx(syn::value_list, new_row_idxs = m_out_coo->get_row_idxs(); } hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter), - dim3(num_blocks), dim3(default_block_size), 0, 0, - old_row_ptrs, old_col_idxs, as_hip_type(old_vals), - oracles, num_rows, bucket, new_row_ptrs, new_row_idxs, - new_col_idxs, as_hip_type(new_vals)); + num_blocks, default_block_size, 0, 0, old_row_ptrs, + old_col_idxs, as_hip_type(old_vals), oracles, num_rows, + bucket, new_row_ptrs, new_row_idxs, new_col_idxs, + as_hip_type(new_vals)); } diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp index 0e1b5afa78b..c7845611700 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -98,9 +98,9 @@ void threshold_filter(syn::value_list, auto num_blocks = ceildiv(num_rows, block_size); auto new_row_ptrs = m_out->get_row_ptrs(); hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::threshold_filter_nnz), - dim3(num_blocks), dim3(default_block_size), 0, 0, old_row_ptrs, - as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower); + HIP_KERNEL_NAME(kernel::threshold_filter_nnz), num_blocks, + default_block_size, 0, 0, old_row_ptrs, as_hip_type(old_vals), num_rows, + threshold, new_row_ptrs, lower); // build row pointers components::prefix_sum(exec, new_row_ptrs, num_rows + 1); @@ -124,10 +124,10 @@ void threshold_filter(syn::value_list, new_row_idxs = m_out_coo->get_row_idxs(); } hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::threshold_filter), - dim3(num_blocks), dim3(default_block_size), 0, 0, - old_row_ptrs, old_col_idxs, as_hip_type(old_vals), - num_rows, threshold, new_row_ptrs, new_row_idxs, - new_col_idxs, as_hip_type(new_vals), lower); + num_blocks, default_block_size, 0, 0, old_row_ptrs, + old_col_idxs, as_hip_type(old_vals), num_rows, threshold, + new_row_ptrs, new_row_idxs, new_col_idxs, + as_hip_type(new_vals), lower); } diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index 7dd3b827e4a..3e6ae96361a 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -75,17 +75,16 @@ void sampleselect_count(std::shared_ptr exec, auto num_blocks = static_cast(ceildiv(num_threads_total, default_block_size)); // pick sample, build searchtree - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), dim3(1), - dim3(bucket_count), 0, 0, as_hip_type(values), size, - tree); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), 1, + bucket_count, 0, 0, as_hip_type(values), size, tree); // determine bucket sizes - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), dim3(num_blocks), - dim3(default_block_size), 0, 0, as_hip_type(values), - size, tree, partial_counts, oracles, items_per_thread); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), num_blocks, + default_block_size, 0, 0, as_hip_type(values), size, + tree, partial_counts, oracles, items_per_thread); // compute prefix sum and total sum over block-local values - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), - dim3(bucket_count), dim3(default_block_size), 0, 0, - partial_counts, total_counts, num_blocks); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), bucket_count, + default_block_size, 0, 0, partial_counts, total_counts, + num_blocks); // compute prefix sum over bucket counts components::prefix_sum(exec, total_counts, bucket_count + 1); } @@ -106,8 +105,8 @@ sampleselect_bucket sampleselect_find_bucket( std::shared_ptr exec, IndexType* prefix_sum, IndexType rank) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::find_bucket), dim3(1), - dim3(config::warp_size), 0, 0, prefix_sum, rank); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::find_bucket), 1, + config::warp_size, 0, 0, prefix_sum, rank); IndexType values[3]{}; exec->get_master()->copy_from(exec.get(), 3, prefix_sum, values); return {values[0], values[1], values[2]}; diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp index 4703fa9bfed..80d8c37ee60 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernel.hip.cpp @@ -78,10 +78,9 @@ void sampleselect_filter(const ValueType* values, IndexType size, auto num_threads_total = ceildiv(size, items_per_thread); auto num_blocks = static_cast(ceildiv(num_threads_total, default_block_size)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), dim3(num_blocks), - dim3(default_block_size), 0, 0, as_hip_type(values), - size, bucket, oracles, partial_counts, out, - items_per_thread); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), num_blocks, + default_block_size, 0, 0, as_hip_type(values), size, + bucket, oracles, partial_counts, out, items_per_thread); } @@ -173,9 +172,9 @@ void threshold_select(std::shared_ptr exec, // base case auto out_ptr = reinterpret_cast(tmp1.get_data()); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), dim3(1), - dim3(kernel::basecase_block_size), 0, 0, tmp22, - bucket.size, rank, out_ptr); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), 1, + kernel::basecase_block_size, 0, 0, tmp22, bucket.size, + rank, out_ptr); threshold = exec->copy_val_to_host(out_ptr); } diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp index 6901287fbe3..7dc4f902eea 100644 --- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp @@ -113,9 +113,9 @@ void add_candidates(syn::value_list, auto u_new_row_ptrs = u_new->get_row_ptrs(); // count non-zeros per row hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::tri_spgeam_nnz), - dim3(num_blocks), dim3(default_block_size), 0, 0, - lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, u_new_row_ptrs, num_rows); + num_blocks, default_block_size, 0, 0, lu_row_ptrs, + lu_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs, + u_new_row_ptrs, num_rows); // build row ptrs components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); @@ -136,10 +136,10 @@ void add_candidates(syn::value_list, // fill columns and values hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::tri_spgeam_init), - dim3(num_blocks), dim3(default_block_size), 0, 0, - lu_row_ptrs, lu_col_idxs, as_hip_type(lu_vals), - a_row_ptrs, a_col_idxs, as_hip_type(a_vals), l_row_ptrs, - l_col_idxs, as_hip_type(l_vals), u_row_ptrs, u_col_idxs, + num_blocks, default_block_size, 0, 0, lu_row_ptrs, + lu_col_idxs, as_hip_type(lu_vals), a_row_ptrs, + a_col_idxs, as_hip_type(a_vals), l_row_ptrs, l_col_idxs, + as_hip_type(l_vals), u_row_ptrs, u_col_idxs, as_hip_type(u_vals), l_new_row_ptrs, l_new_col_idxs, as_hip_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, as_hip_type(u_new_vals), num_rows); diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp index 2652cd8fcbc..9463115cda4 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp @@ -97,8 +97,8 @@ void compute_l_u_factors(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::sweep), dim3(num_blocks), - dim3(default_block_size), 0, 0, a->get_const_row_ptrs(), + HIP_KERNEL_NAME(kernel::sweep), num_blocks, + default_block_size, 0, 0, a->get_const_row_ptrs(), a->get_const_col_idxs(), as_hip_type(a->get_const_values()), l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), l->get_const_col_idxs(), as_hip_type(l->get_values()), diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp index 3e25f713397..36b053e25f6 100644 --- a/hip/matrix/coo_kernels.hip.cpp +++ b/hip/matrix/coo_kernels.hip.cpp @@ -123,9 +123,9 @@ void spmv2(std::shared_ptr exec, const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); int num_lines = ceildiv(nnz, nwarps * config::warp_size); hipLaunchKernelGGL( - abstract_spmv, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, - num_lines, as_hip_type(a->get_const_values()), - a->get_const_col_idxs(), as_hip_type(a->get_const_row_idxs()), + abstract_spmv, coo_grid, coo_block, 0, 0, nnz, num_lines, + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_idxs()), as_hip_type(b->get_const_values()), b->get_stride(), as_hip_type(c->get_values()), c->get_stride()); } else { @@ -134,10 +134,10 @@ void spmv2(std::shared_ptr exec, const dim3 coo_grid(ceildiv(nwarps, warps_in_block), ceildiv(b_ncols, config::warp_size)); hipLaunchKernelGGL( - abstract_spmm, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, - num_elems, as_hip_type(a->get_const_values()), - a->get_const_col_idxs(), as_hip_type(a->get_const_row_idxs()), - b_ncols, as_hip_type(b->get_const_values()), b->get_stride(), + abstract_spmm, coo_grid, coo_block, 0, 0, nnz, num_elems, + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_idxs()), b_ncols, + as_hip_type(b->get_const_values()), b->get_stride(), as_hip_type(c->get_values()), c->get_stride()); } } @@ -164,8 +164,8 @@ void advanced_spmv2(std::shared_ptr exec, int num_lines = ceildiv(nnz, nwarps * config::warp_size); const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); hipLaunchKernelGGL( - abstract_spmv, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, - num_lines, as_hip_type(alpha->get_const_values()), + abstract_spmv, coo_grid, coo_block, 0, 0, nnz, num_lines, + as_hip_type(alpha->get_const_values()), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_idxs()), as_hip_type(b->get_const_values()), b->get_stride(), @@ -176,8 +176,8 @@ void advanced_spmv2(std::shared_ptr exec, const dim3 coo_grid(ceildiv(nwarps, warps_in_block), ceildiv(b_ncols, config::warp_size)); hipLaunchKernelGGL( - abstract_spmm, dim3(coo_grid), dim3(coo_block), 0, 0, nnz, - num_elems, as_hip_type(alpha->get_const_values()), + abstract_spmm, coo_grid, coo_block, 0, 0, nnz, num_elems, + as_hip_type(alpha->get_const_values()), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_idxs()), b_ncols, as_hip_type(b->get_const_values()), b->get_stride(), diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index be4d51fe1c0..be63a3f2ff1 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -117,8 +117,8 @@ void merge_path_spmv(syn::value_list, const IndexType total = a->get_size()[0] + a->get_num_stored_elements(); const IndexType grid_num = ceildiv(total, spmv_block_size * items_per_thread); - const dim3 grid(grid_num); - const dim3 block(spmv_block_size); + const auto grid = grid_num; + const auto block = spmv_block_size; Array row_out(exec, grid_num); Array val_out(exec, grid_num); @@ -129,17 +129,15 @@ void merge_path_spmv(syn::value_list, hipLaunchKernelGGL( HIP_KERNEL_NAME( kernel::abstract_merge_path_spmv), - dim3(grid), dim3(block), 0, 0, - static_cast(a->get_size()[0]), + grid, block, 0, 0, static_cast(a->get_size()[0]), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), as_hip_type(a->get_const_srow()), as_hip_type(b_vals), b->get_stride(), as_hip_type(c_vals), c->get_stride(), as_hip_type(row_out.get_data()), as_hip_type(val_out.get_data())); - hipLaunchKernelGGL(kernel::abstract_reduce, dim3(1), - dim3(spmv_block_size), 0, 0, grid_num, - as_hip_type(val_out.get_data()), + hipLaunchKernelGGL(kernel::abstract_reduce, 1, spmv_block_size, 0, + 0, grid_num, as_hip_type(val_out.get_data()), as_hip_type(row_out.get_data()), as_hip_type(c_vals), c->get_stride()); @@ -149,8 +147,7 @@ void merge_path_spmv(syn::value_list, hipLaunchKernelGGL( HIP_KERNEL_NAME( kernel::abstract_merge_path_spmv), - dim3(grid), dim3(block), 0, 0, - static_cast(a->get_size()[0]), + grid, block, 0, 0, static_cast(a->get_size()[0]), as_hip_type(alpha->get_const_values()), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), @@ -159,9 +156,8 @@ void merge_path_spmv(syn::value_list, as_hip_type(c_vals), c->get_stride(), as_hip_type(row_out.get_data()), as_hip_type(val_out.get_data())); - hipLaunchKernelGGL(kernel::abstract_reduce, dim3(1), - dim3(spmv_block_size), 0, 0, grid_num, - as_hip_type(val_out.get_data()), + hipLaunchKernelGGL(kernel::abstract_reduce, 1, spmv_block_size, 0, + 0, grid_num, as_hip_type(val_out.get_data()), as_hip_type(row_out.get_data()), as_hip_type(alpha->get_const_values()), as_hip_type(c_vals), c->get_stride()); @@ -244,12 +240,12 @@ void classical_spmv(syn::value_list, std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), int64(nwarps / warps_in_block)); const dim3 grid(gridx, b->get_size()[1]); - const dim3 block(spmv_block_size); + const auto block = spmv_block_size; if (alpha == nullptr && beta == nullptr) { hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::abstract_classical_spmv), - dim3(grid), dim3(block), 0, 0, a->get_size()[0], + grid, block, 0, 0, a->get_size()[0], as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), as_hip_type(b->get_const_values()), b->get_stride(), @@ -258,7 +254,7 @@ void classical_spmv(syn::value_list, } else if (alpha != nullptr && beta != nullptr) { hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::abstract_classical_spmv), - dim3(grid), dim3(block), 0, 0, a->get_size()[0], + grid, block, 0, 0, a->get_size()[0], as_hip_type(alpha->get_const_values()), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), @@ -290,8 +286,8 @@ void spmv(std::shared_ptr exec, const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); hipLaunchKernelGGL( - kernel::abstract_spmv, dim3(csr_grid), dim3(csr_block), 0, 0, - nwarps, static_cast(a->get_size()[0]), + kernel::abstract_spmv, csr_grid, csr_block, 0, 0, nwarps, + static_cast(a->get_size()[0]), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), as_hip_type(a->get_const_srow()), @@ -384,8 +380,8 @@ void advanced_spmv(std::shared_ptr exec, const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); hipLaunchKernelGGL( - kernel::abstract_spmv, dim3(csr_grid), dim3(csr_block), 0, 0, - nwarps, static_cast(a->get_size()[0]), + kernel::abstract_spmv, csr_grid, csr_block, 0, 0, nwarps, + static_cast(a->get_size()[0]), as_hip_type(alpha->get_const_values()), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(a->get_const_row_ptrs()), @@ -555,9 +551,8 @@ void spgeam(syn::value_list, auto subwarps_per_block = default_block_size / subwarp_size; auto num_blocks = ceildiv(m, subwarps_per_block); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam_nnz), - dim3(num_blocks), dim3(default_block_size), 0, 0, - a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, - c_row_ptrs); + num_blocks, default_block_size, 0, 0, a_row_ptrs, + a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); // build row pointers components::prefix_sum(exec, c_row_ptrs, m + 1); @@ -569,12 +564,11 @@ void spgeam(syn::value_list, c_builder.get_value_array().resize_and_reset(c_nnz); auto c_col_idxs = c->get_col_idxs(); auto c_vals = c->get_values(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam), - dim3(num_blocks), dim3(default_block_size), 0, 0, - as_hip_type(alpha), a_row_ptrs, a_col_idxs, - as_hip_type(a_vals), as_hip_type(beta), b_row_ptrs, - b_col_idxs, as_hip_type(b_vals), m, c_row_ptrs, - c_col_idxs, as_hip_type(c_vals)); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::spgeam), num_blocks, + default_block_size, 0, 0, as_hip_type(alpha), a_row_ptrs, a_col_idxs, + as_hip_type(a_vals), as_hip_type(beta), b_row_ptrs, b_col_idxs, + as_hip_type(b_vals), m, c_row_ptrs, c_col_idxs, as_hip_type(c_vals)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); @@ -717,10 +711,10 @@ void fill_in_dense(std::shared_ptr exec, const auto vals = source->get_const_values(); auto grid_dim = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL( - kernel::fill_in_dense, dim3(grid_dim), dim3(default_block_size), 0, 0, - num_rows, as_hip_type(row_ptrs), as_hip_type(col_idxs), - as_hip_type(vals), stride, as_hip_type(result->get_values())); + hipLaunchKernelGGL(kernel::fill_in_dense, grid_dim, default_block_size, 0, + 0, num_rows, as_hip_type(row_ptrs), + as_hip_type(col_idxs), as_hip_type(vals), stride, + as_hip_type(result->get_values())); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -756,9 +750,9 @@ void conj_transpose(std::shared_ptr exec, matrix::Csr* trans) { if (hipsparse::is_supported::value) { - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size( - ceildiv(trans->get_num_stored_elements(), block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = + ceildiv(trans->get_num_stored_elements(), block_size); hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; @@ -770,8 +764,8 @@ void conj_transpose(std::shared_ptr exec, orig->get_const_col_idxs(), trans->get_values(), trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); - hipLaunchKernelGGL(conjugate_kernel, dim3(grid_size), dim3(block_size), - 0, 0, trans->get_num_stored_elements(), + hipLaunchKernelGGL(conjugate_kernel, grid_size, block_size, 0, 0, + trans->get_num_stored_elements(), as_hip_type(trans->get_values())); } else { GKO_NOT_IMPLEMENTED; @@ -873,8 +867,8 @@ void calculate_nonzeros_per_row_in_span( auto col_idxs = source->get_const_col_idxs(); auto grid_dim = ceildiv(row_span.length(), default_block_size); - hipLaunchKernelGGL(kernel::calculate_nnz_per_row_in_span, dim3(grid_dim), - dim3(default_block_size), 0, 0, row_span, col_span, + hipLaunchKernelGGL(kernel::calculate_nnz_per_row_in_span, grid_dim, + default_block_size, 0, 0, row_span, col_span, as_hip_type(row_ptrs), as_hip_type(col_idxs), as_hip_type(row_nnz->get_data())); } @@ -899,9 +893,9 @@ void compute_submatrix(std::shared_ptr exec, auto num_nnz = source->get_num_stored_elements(); grid_dim = ceildiv(num_nnz, default_block_size); hipLaunchKernelGGL( - kernel::compute_submatrix_idxs_and_vals, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, num_cols, num_nnz, row_offset, - col_offset, as_hip_type(source->get_const_row_ptrs()), + kernel::compute_submatrix_idxs_and_vals, grid_dim, default_block_size, + 0, 0, num_rows, num_cols, num_nnz, row_offset, col_offset, + as_hip_type(source->get_const_row_ptrs()), as_hip_type(source->get_const_col_idxs()), as_hip_type(source->get_const_values()), as_hip_type(result->get_const_row_ptrs()), @@ -971,10 +965,10 @@ void is_sorted_by_column_index( auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); auto num_blocks = ceildiv(num_rows, block_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::check_unsorted), dim3(num_blocks), - dim3(block_size), 0, 0, to_check->get_const_row_ptrs(), - to_check->get_const_col_idxs(), num_rows, gpu_array.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::check_unsorted), num_blocks, + block_size, 0, 0, to_check->get_const_row_ptrs(), + to_check->get_const_col_idxs(), num_rows, + gpu_array.get_data()); cpu_array = gpu_array; } @@ -997,11 +991,10 @@ void extract_diagonal(std::shared_ptr exec, const auto orig_col_idxs = orig->get_const_col_idxs(); auto diag_values = diag->get_values(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::extract_diagonal), - dim3(num_blocks), dim3(default_block_size), 0, 0, - diag_size, nnz, as_hip_type(orig_values), - as_hip_type(orig_row_ptrs), as_hip_type(orig_col_idxs), - as_hip_type(diag_values)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::extract_diagonal), num_blocks, + default_block_size, 0, 0, diag_size, nnz, + as_hip_type(orig_values), as_hip_type(orig_row_ptrs), + as_hip_type(orig_col_idxs), as_hip_type(diag_values)); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 6a48c229964..6e7dd34bfbc 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -152,10 +152,10 @@ void convert_to_coo(std::shared_ptr exec, const auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); if (grid_dim > 0) { - hipLaunchKernelGGL(kernel::fill_in_coo, dim3(grid_dim), - dim3(default_block_size), 0, 0, num_rows, num_cols, - stride, as_hip_type(source->get_const_values()), - row_ptrs, row_idxs, col_idxs, as_hip_type(values)); + hipLaunchKernelGGL(kernel::fill_in_coo, grid_dim, default_block_size, 0, + 0, num_rows, num_cols, stride, + as_hip_type(source->get_const_values()), row_ptrs, + row_idxs, col_idxs, as_hip_type(values)); } } @@ -181,8 +181,8 @@ void convert_to_csr(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (grid_dim > 0) { hipLaunchKernelGGL( - kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0, - num_rows, num_cols, stride, as_hip_type(source->get_const_values()), + kernel::fill_in_csr, grid_dim, default_block_size, 0, 0, num_rows, + num_cols, stride, as_hip_type(source->get_const_values()), as_hip_type(row_ptrs), as_hip_type(col_idxs), as_hip_type(values)); } } @@ -210,10 +210,9 @@ void convert_to_ell(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (grid_dim > 0) { hipLaunchKernelGGL( - kernel::fill_in_ell, dim3(grid_dim), dim3(default_block_size), 0, 0, - num_rows, num_cols, source_stride, - as_hip_type(source->get_const_values()), max_nnz_per_row, - result_stride, col_idxs, as_hip_type(values)); + kernel::fill_in_ell, grid_dim, default_block_size, 0, 0, num_rows, + num_cols, source_stride, as_hip_type(source->get_const_values()), + max_nnz_per_row, result_stride, col_idxs, as_hip_type(values)); } } @@ -273,11 +272,11 @@ void convert_to_sellp(std::shared_ptr exec, auto grid_dim = ceildiv(num_rows, default_block_size / config::warp_size); if (grid_dim > 0) { - hipLaunchKernelGGL( - kernel::fill_in_sellp, dim3(grid_dim), dim3(default_block_size), 0, - 0, num_rows, num_cols, slice_size, stride, - as_hip_type(source->get_const_values()), as_hip_type(slice_lengths), - as_hip_type(slice_sets), as_hip_type(col_idxs), as_hip_type(vals)); + hipLaunchKernelGGL(kernel::fill_in_sellp, grid_dim, default_block_size, + 0, 0, num_rows, num_cols, slice_size, stride, + as_hip_type(source->get_const_values()), + as_hip_type(slice_lengths), as_hip_type(slice_sets), + as_hip_type(col_idxs), as_hip_type(vals)); } } diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index ad5d5132087..6f7e882f429 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -156,7 +156,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, if (alpha == nullptr && beta == nullptr) { hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::spmv), - dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, + grid_size, block_size, 0, 0, nrows, num_worker_per_row, acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride, num_stored_elements_per_row, acc::as_hip_range(b_vals), as_hip_type(c->get_values()), c->get_stride()); @@ -165,7 +165,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, std::array{1}, alpha->get_const_values()); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::spmv), - dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, + grid_size, block_size, 0, 0, nrows, num_worker_per_row, acc::as_hip_range(alpha_val), acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride, num_stored_elements_per_row, acc::as_hip_range(b_vals), as_hip_type(beta->get_const_values()), diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index af381e4b906..507a6f9ef2a 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -73,12 +73,12 @@ void spmv(std::shared_ptr exec, const matrix::Sellp* a, const matrix::Dense* b, matrix::Dense* c) { - const dim3 blockSize(default_block_size); + const auto blockSize = default_block_size; const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); hipLaunchKernelGGL( - spmv_kernel, dim3(gridSize), dim3(blockSize), 0, 0, a->get_size()[0], + spmv_kernel, gridSize, blockSize, 0, 0, a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), a->get_const_slice_sets(), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(b->get_const_values()), @@ -96,15 +96,14 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - const dim3 blockSize(default_block_size); + const auto blockSize = default_block_size; const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), b->get_size()[1]); hipLaunchKernelGGL( - advanced_spmv_kernel, dim3(gridSize), dim3(blockSize), 0, 0, - a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), - a->get_slice_size(), a->get_const_slice_sets(), - as_hip_type(alpha->get_const_values()), + advanced_spmv_kernel, gridSize, blockSize, 0, 0, a->get_size()[0], + b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), + a->get_const_slice_sets(), as_hip_type(alpha->get_const_values()), as_hip_type(a->get_const_values()), a->get_const_col_idxs(), as_hip_type(b->get_const_values()), as_hip_type(beta->get_const_values()), as_hip_type(c->get_values())); diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp index 1c0f49a2f8c..a0a16da9659 100644 --- a/hip/multigrid/amgx_pgm_kernels.hip.cpp +++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp @@ -81,10 +81,10 @@ void match_edge(std::shared_ptr exec, Array& agg) { const auto num = agg.get_num_elems(); - const dim3 grid(ceildiv(num, default_block_size)); - hipLaunchKernelGGL(kernel::match_edge_kernel, dim3(grid), - dim3(default_block_size), 0, 0, num, - strongest_neighbor.get_const_data(), agg.get_data()); + const auto grid = ceildiv(num, default_block_size); + hipLaunchKernelGGL(kernel::match_edge_kernel, grid, default_block_size, 0, + 0, num, strongest_neighbor.get_const_data(), + agg.get_data()); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL); @@ -95,9 +95,8 @@ void count_unagg(std::shared_ptr exec, const Array& agg, IndexType* num_unagg) { Array active_agg(exec, agg.get_num_elems()); - const dim3 grid(ceildiv(active_agg.get_num_elems(), default_block_size)); - hipLaunchKernelGGL(kernel::activate_kernel, dim3(grid), - dim3(default_block_size), 0, 0, + const auto grid = ceildiv(active_agg.get_num_elems(), default_block_size); + hipLaunchKernelGGL(kernel::activate_kernel, grid, default_block_size, 0, 0, active_agg.get_num_elems(), agg.get_const_data(), active_agg.get_data()); *num_unagg = reduce_add_array(exec, active_agg.get_num_elems(), @@ -113,14 +112,12 @@ void renumber(std::shared_ptr exec, Array& agg, { const auto num = agg.get_num_elems(); Array agg_map(exec, num + 1); - const dim3 grid(ceildiv(num, default_block_size)); - hipLaunchKernelGGL(kernel::fill_agg_kernel, dim3(grid), - dim3(default_block_size), 0, 0, num, - agg.get_const_data(), agg_map.get_data()); + const auto grid = ceildiv(num, default_block_size); + hipLaunchKernelGGL(kernel::fill_agg_kernel, grid, default_block_size, 0, 0, + num, agg.get_const_data(), agg_map.get_data()); components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems()); - hipLaunchKernelGGL(kernel::renumber_kernel, dim3(grid), - dim3(default_block_size), 0, 0, num, - agg_map.get_const_data(), agg.get_data()); + hipLaunchKernelGGL(kernel::renumber_kernel, grid, default_block_size, 0, 0, + num, agg_map.get_const_data(), agg.get_data()); *num_agg = exec->copy_val_to_host(agg_map.get_const_data() + num); } @@ -135,13 +132,12 @@ void find_strongest_neighbor( Array& strongest_neighbor) { const auto num = agg.get_num_elems(); - const dim3 grid(ceildiv(num, default_block_size)); - hipLaunchKernelGGL(kernel::find_strongest_neighbor_kernel, dim3(grid), - dim3(default_block_size), 0, 0, num, - weight_mtx->get_const_row_ptrs(), - weight_mtx->get_const_col_idxs(), - weight_mtx->get_const_values(), diag->get_const_values(), - agg.get_data(), strongest_neighbor.get_data()); + const auto grid = ceildiv(num, default_block_size); + hipLaunchKernelGGL( + kernel::find_strongest_neighbor_kernel, grid, default_block_size, 0, 0, + num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(), + weight_mtx->get_const_values(), diag->get_const_values(), + agg.get_data(), strongest_neighbor.get_data()); } GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( @@ -156,26 +152,25 @@ void assign_to_exist_agg(std::shared_ptr exec, Array& intermediate_agg) { const auto num = agg.get_num_elems(); - const dim3 grid(ceildiv(num, default_block_size)); + const auto grid = ceildiv(num, default_block_size); if (intermediate_agg.get_num_elems() > 0) { // determinstic kernel hipLaunchKernelGGL( - kernel::assign_to_exist_agg_kernel, dim3(grid), - dim3(default_block_size), 0, 0, num, - weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(), - weight_mtx->get_const_values(), diag->get_const_values(), - agg.get_const_data(), intermediate_agg.get_data()); + kernel::assign_to_exist_agg_kernel, grid, default_block_size, 0, 0, + num, weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), + diag->get_const_values(), agg.get_const_data(), + intermediate_agg.get_data()); // Copy the intermediate_agg to agg agg = intermediate_agg; } else { // undeterminstic kernel - hipLaunchKernelGGL(kernel::assign_to_exist_agg_kernel, dim3(grid), - dim3(default_block_size), 0, 0, num, - weight_mtx->get_const_row_ptrs(), - weight_mtx->get_const_col_idxs(), - weight_mtx->get_const_values(), - diag->get_const_values(), agg.get_data()); + hipLaunchKernelGGL( + kernel::assign_to_exist_agg_kernel, grid, default_block_size, 0, 0, + num, weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), + diag->get_const_values(), agg.get_data()); } } diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp index 02bb08bab3c..e43e47f83ce 100644 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -83,8 +83,8 @@ void generate_tri_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); if (lower) { hipLaunchKernelGGL( HIP_KERNEL_NAME( @@ -121,8 +121,8 @@ void generate_general_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); hipLaunchKernelGGL( HIP_KERNEL_NAME( kernel::generate_general_inverse), @@ -151,8 +151,8 @@ void generate_excess_system(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::generate_excess_system), grid, block, 0, 0, static_cast(num_rows), @@ -174,8 +174,8 @@ void scale_excess_solution(std::shared_ptr, matrix::Dense* excess_solution, size_type e_start, size_type e_end) { - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::scale_excess_solution), grid, block, 0, 0, excess_block_ptrs, @@ -195,8 +195,8 @@ void scatter_excess_solution(std::shared_ptr exec, { const auto num_rows = inverse->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::copy_excess_solution), grid, block, 0, 0, static_cast(num_rows), diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp index bd1878e393b..f347ba00363 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -88,8 +88,8 @@ void advanced_apply( { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); if (block_precisions) { @@ -97,17 +97,16 @@ void advanced_apply( HIP_KERNEL_NAME( kernel::advanced_adaptive_apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_precisions, block_pointers, num_blocks, - as_hip_type(alpha), as_hip_type(b), b_stride, as_hip_type(x), - x_stride); + grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, + block_precisions, block_pointers, num_blocks, as_hip_type(alpha), + as_hip_type(b), b_stride, as_hip_type(x), x_stride); } else { hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::advanced_apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_pointers, num_blocks, as_hip_type(alpha), - as_hip_type(b), b_stride, as_hip_type(x), x_stride); + grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, + block_pointers, num_blocks, as_hip_type(alpha), as_hip_type(b), + b_stride, as_hip_type(x), x_stride); } } diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 5fd229dca89..37a6c027438 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -88,8 +88,8 @@ void generate(syn::value_list, { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); if (block_precisions) { @@ -97,7 +97,7 @@ void generate(syn::value_list, HIP_KERNEL_NAME( kernel::adaptive_generate), - dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], + grid_size, block_size, 0, 0, mtx->get_size()[0], mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), as_hip_type(mtx->get_const_values()), as_hip_type(accuracy), as_hip_type(block_data), storage_scheme, as_hip_type(conditioning), @@ -106,7 +106,7 @@ void generate(syn::value_list, hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::generate), - dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], + grid_size, block_size, 0, 0, mtx->get_size()[0], mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), as_hip_type(mtx->get_const_values()), as_hip_type(block_data), storage_scheme, block_ptrs, num_blocks); diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index c2f22b4aefe..ed374915cc7 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -86,14 +86,14 @@ size_type find_natural_blocks(std::shared_ptr exec, Array matching_next_row(exec, mtx->get_size()[0] - 1); - const dim3 block_size(config::warp_size, 1, 1); - const dim3 grid_size( - ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1); - hipLaunchKernelGGL(compare_adjacent_rows, dim3(grid_size), dim3(block_size), - 0, 0, mtx->get_size()[0], max_block_size, + const auto block_size = config::warp_size; + const auto grid_size = + ceildiv(mtx->get_size()[0] * config::warp_size, block_size); + hipLaunchKernelGGL(compare_adjacent_rows, grid_size, block_size, 0, 0, + mtx->get_size()[0], max_block_size, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), matching_next_row.get_data()); - hipLaunchKernelGGL(generate_natural_block_pointer, dim3(1), dim3(1), 0, 0, + hipLaunchKernelGGL(generate_natural_block_pointer, 1, 1, 0, 0, mtx->get_size()[0], max_block_size, matching_next_row.get_const_data(), block_ptrs, nums.get_data()); @@ -109,8 +109,8 @@ inline size_type agglomerate_supervariables( { Array nums(exec, 1); - hipLaunchKernelGGL(agglomerate_supervariables_kernel, dim3(1), dim3(1), 0, - 0, max_block_size, num_natural_blocks, block_ptrs, + hipLaunchKernelGGL(agglomerate_supervariables_kernel, 1, 1, 0, 0, + max_block_size, num_natural_blocks, block_ptrs, nums.get_data()); nums.set_executor(exec->get_master()); @@ -130,9 +130,9 @@ void initialize_precisions(std::shared_ptr exec, default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); hipLaunchKernelGGL(HIP_KERNEL_NAME(duplicate_array), - dim3(grid_size), dim3(block_size), 0, 0, - source.get_const_data(), source.get_num_elems(), - precisions.get_data(), precisions.get_num_elems()); + grid_size, block_size, 0, 0, source.get_const_data(), + source.get_num_elems(), precisions.get_data(), + precisions.get_num_elems()); } @@ -167,8 +167,8 @@ void transpose_jacobi( { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); if (block_precisions) { @@ -176,16 +176,15 @@ void transpose_jacobi( HIP_KERNEL_NAME( adaptive_transpose_jacobi), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_precisions, block_pointers, num_blocks, + grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, + block_precisions, block_pointers, num_blocks, as_hip_type(out_blocks)); } else { hipLaunchKernelGGL( HIP_KERNEL_NAME(transpose_jacobi), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_pointers, num_blocks, - as_hip_type(out_blocks)); + grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, + block_pointers, num_blocks, as_hip_type(out_blocks)); } } diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp index 4ebc25fd30c..1fa2f262fbf 100644 --- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -84,24 +84,24 @@ void apply(syn::value_list, size_type num_blocks, { constexpr int subwarp_size = get_larger_power(max_block_size); constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); if (block_precisions) { hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::adaptive_apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_precisions, block_pointers, num_blocks, - as_hip_type(b), b_stride, as_hip_type(x), x_stride); + grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, + block_precisions, block_pointers, num_blocks, as_hip_type(b), + b_stride, as_hip_type(x), x_stride); } else { hipLaunchKernelGGL( HIP_KERNEL_NAME( kernel::apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_pointers, num_blocks, as_hip_type(b), - b_stride, as_hip_type(x), x_stride); + grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, + block_pointers, num_blocks, as_hip_type(b), b_stride, + as_hip_type(x), x_stride); } } diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 7b431d0b758..1ab7dc0f554 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -83,8 +83,8 @@ constexpr int default_dot_size = default_dot_dim * default_dot_dim; template void zero_matrix(size_type m, size_type n, size_type stride, ValueType* array) { - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(n, block_size); hipLaunchKernelGGL(zero_matrix_kernel, grid_size, block_size, 0, 0, m, n, stride, as_hip_type(array)); } @@ -100,8 +100,8 @@ void initialize_1(std::shared_ptr exec, { const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), krylov_dim * b->get_size()[1]); - const dim3 grid_dim(ceildiv(num_threads, default_block_size), 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + const auto grid_dim = ceildiv(num_threads, default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; hipLaunchKernelGGL( @@ -134,9 +134,9 @@ void initialize_2(std::shared_ptr exec, const auto krylov_stride = gko::cb_gmres::helper_functions_accessor::get_stride( krylov_bases); - const dim3 grid_dim_1( - ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size), 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + const auto grid_dim_1 = + ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); @@ -164,16 +164,16 @@ void initialize_2(std::shared_ptr exec, if (gko::cb_gmres::detail::has_3d_scaled_accessor::value) { hipLaunchKernelGGL( set_scalar_kernel, - dim3(ceildiv(num_rhs * (krylov_dim + 1), default_block_size)), - dim3(default_block_size), 0, 0, num_rhs, krylov_dim + 1, + ceildiv(num_rhs * (krylov_dim + 1), default_block_size), + default_block_size, 0, 0, num_rhs, krylov_dim + 1, as_hip_type(residual_norm->get_const_values()), residual_norm->get_stride(), as_hip_type(arnoldi_norm->get_const_values() + 2 * stride_arnoldi), stride_arnoldi, acc::as_hip_range(krylov_bases)); } - const dim3 grid_dim_2( - ceildiv(num_rows * krylov_stride[1], default_block_size), 1, 1); + const auto grid_dim_2 = + ceildiv(num_rows * krylov_stride[1], default_block_size); hipLaunchKernelGGL(initialize_2_2_kernel, grid_dim_2, block_dim, 0, 0, residual->get_size()[0], residual->get_size()[1], as_hip_type(residual->get_const_values()), @@ -222,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, // further investigation. const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, iter + 1); - const dim3 block_size_iters_single(singledot_block_size); + const auto block_size_iters_single = singledot_block_size; size_type num_reorth_host; components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], @@ -257,8 +257,8 @@ void finish_arnoldi_CGS(std::shared_ptr exec, // end hipLaunchKernelGGL( update_next_krylov_kernel, - dim3(ceildiv(dim_size[0] * stride_next_krylov, default_block_size)), - dim3(default_block_size), 0, 0, iter + 1, dim_size[0], dim_size[1], + ceildiv(dim_size[0] * stride_next_krylov, default_block_size), + default_block_size, 0, 0, iter + 1, dim_size[0], dim_size[1], as_hip_type(next_krylov_basis->get_values()), stride_next_krylov, acc::as_hip_range(krylov_bases), as_hip_type(hessenberg_iter->get_const_values()), stride_hessenberg, @@ -285,9 +285,8 @@ void finish_arnoldi_CGS(std::shared_ptr exec, components::fill_array(exec, num_reorth->get_data(), 1, zero()); hipLaunchKernelGGL( check_arnoldi_norms, - dim3(ceildiv(dim_size[1], default_block_size)), - dim3(default_block_size), 0, 0, dim_size[1], - as_hip_type(arnoldi_norm->get_values()), stride_arnoldi, + ceildiv(dim_size[1], default_block_size), default_block_size, 0, 0, + dim_size[1], as_hip_type(arnoldi_norm->get_values()), stride_arnoldi, as_hip_type(hessenberg_iter->get_values()), stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), as_hip_type(stop_status), as_hip_type(reorth_status), as_hip_type(num_reorth->get_data())); @@ -318,8 +317,8 @@ void finish_arnoldi_CGS(std::shared_ptr exec, // end hipLaunchKernelGGL( update_next_krylov_and_add_kernel, - dim3(ceildiv(dim_size[0] * stride_next_krylov, default_block_size)), - dim3(default_block_size), 0, 0, iter + 1, dim_size[0], dim_size[1], + ceildiv(dim_size[0] * stride_next_krylov, default_block_size), + default_block_size, 0, 0, iter + 1, dim_size[0], dim_size[1], as_hip_type(next_krylov_basis->get_values()), stride_next_krylov, acc::as_hip_range(krylov_bases), as_hip_type(hessenberg_iter->get_values()), stride_hessenberg, @@ -349,12 +348,12 @@ void finish_arnoldi_CGS(std::shared_ptr exec, zero()); hipLaunchKernelGGL( check_arnoldi_norms, - dim3(ceildiv(dim_size[1], default_block_size)), - dim3(default_block_size), 0, 0, dim_size[1], - as_hip_type(arnoldi_norm->get_values()), stride_arnoldi, - as_hip_type(hessenberg_iter->get_values()), stride_hessenberg, - iter + 1, acc::as_hip_range(krylov_bases), as_hip_type(stop_status), - as_hip_type(reorth_status), as_hip_type(num_reorth->get_data())); + ceildiv(dim_size[1], default_block_size), default_block_size, 0, 0, + dim_size[1], as_hip_type(arnoldi_norm->get_values()), + stride_arnoldi, as_hip_type(hessenberg_iter->get_values()), + stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), + as_hip_type(stop_status), as_hip_type(reorth_status), + as_hip_type(num_reorth->get_data())); num_reorth_host = exec->copy_val_to_host(num_reorth->get_const_data()); // num_reorth_host := number of next_krylov vector to be // reorthogonalization @@ -362,8 +361,8 @@ void finish_arnoldi_CGS(std::shared_ptr exec, hipLaunchKernelGGL( update_krylov_next_krylov_kernel, - dim3(ceildiv(dim_size[0] * stride_next_krylov, default_block_size)), - dim3(default_block_size), 0, 0, iter, dim_size[0], dim_size[1], + ceildiv(dim_size[0] * stride_next_krylov, default_block_size), + default_block_size, 0, 0, iter, dim_size[0], dim_size[1], as_hip_type(next_krylov_basis->get_values()), stride_next_krylov, acc::as_hip_range(krylov_bases), as_hip_type(hessenberg_iter->get_const_values()), stride_hessenberg, @@ -385,9 +384,9 @@ void givens_rotation(std::shared_ptr exec, // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_cols = hessenberg_iter->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{ - static_cast(ceildiv(num_cols, block_size)), 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_cols, block_size)); hipLaunchKernelGGL( givens_rotation_kernel, grid_dim, block_dim, 0, 0, @@ -418,10 +417,9 @@ void step_1(std::shared_ptr exec, { hipLaunchKernelGGL( increase_final_iteration_numbers_kernel, - dim3(static_cast( - ceildiv(final_iter_nums->get_num_elems(), default_block_size))), - dim3(default_block_size), 0, 0, - as_hip_type(final_iter_nums->get_data()), + static_cast( + ceildiv(final_iter_nums->get_num_elems(), default_block_size)), + default_block_size, 0, 0, as_hip_type(final_iter_nums->get_data()), as_hip_type(stop_status->get_const_data()), final_iter_nums->get_num_elems()); finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, @@ -444,9 +442,9 @@ void solve_upper_triangular( // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_rhs = residual_norm_collection->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv(num_rhs, block_size)), - 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_rhs, block_size)); hipLaunchKernelGGL( solve_upper_triangular_kernel, grid_dim, block_dim, 0, 0, @@ -471,11 +469,9 @@ void calculate_qy(ConstAccessor3d krylov_bases, size_type num_krylov_bases, before_preconditioner->get_stride(); constexpr auto block_size = default_block_size; - const dim3 grid_dim{ - static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)), - 1, 1}; - const dim3 block_dim{block_size, 1, 1}; + const auto grid_dim = static_cast( + ceildiv(num_rows * stride_before_preconditioner, block_size)); + const auto block_dim = block_size; hipLaunchKernelGGL(calculate_Qy_kernel, grid_dim, block_dim, 0, diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp index 99600dda307..5945385e732 100644 --- a/hip/solver/gmres_kernels.hip.cpp +++ b/hip/solver/gmres_kernels.hip.cpp @@ -89,13 +89,13 @@ void initialize_1(std::shared_ptr exec, { const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), krylov_dim * b->get_size()[1]); - const dim3 grid_dim(ceildiv(num_threads, default_block_size), 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + const auto grid_dim = ceildiv(num_threads, default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; hipLaunchKernelGGL( - HIP_KERNEL_NAME(initialize_1_kernel), dim3(grid_dim), - dim3(block_dim), 0, 0, b->get_size()[0], b->get_size()[1], krylov_dim, + HIP_KERNEL_NAME(initialize_1_kernel), grid_dim, block_dim, + 0, 0, b->get_size()[0], b->get_size()[1], krylov_dim, as_hip_type(b->get_const_values()), b->get_stride(), as_hip_type(residual->get_values()), residual->get_stride(), as_hip_type(givens_sin->get_values()), givens_sin->get_stride(), @@ -116,20 +116,18 @@ void initialize_2(std::shared_ptr exec, { const auto num_rows = residual->get_size()[0]; const auto num_rhs = residual->get_size()[1]; - const dim3 grid_dim_1( + const auto grid_dim_1 = ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(), - default_block_size), - 1, 1); - const dim3 block_dim(default_block_size, 1, 1); + default_block_size); + const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; kernels::hip::dense::compute_norm2(exec, residual, residual_norm); - const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, - 1); + const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); hipLaunchKernelGGL( - HIP_KERNEL_NAME(initialize_2_2_kernel), dim3(grid_dim_2), - dim3(block_dim), 0, 0, residual->get_size()[0], residual->get_size()[1], + HIP_KERNEL_NAME(initialize_2_2_kernel), grid_dim_2, + block_dim, 0, 0, residual->get_size()[0], residual->get_size()[1], as_hip_type(residual->get_const_values()), residual->get_stride(), as_hip_type(residual_norm->get_const_values()), as_hip_type(residual_norm_collection->get_values()), @@ -167,12 +165,12 @@ void finish_arnoldi(std::shared_ptr exec, size_type num_rows, components::fill_array( exec, hessenberg_iter->get_values() + k * stride_hessenberg, hessenberg_iter->get_size()[1], zero()); - hipLaunchKernelGGL( - multidot_kernel, dim3(grid_size), dim3(block_size), 0, 0, k, - num_rows, hessenberg_iter->get_size()[1], - as_hip_type(k_krylov_bases), as_hip_type(next_krylov_basis), - stride_krylov, as_hip_type(hessenberg_iter->get_values()), - stride_hessenberg, as_hip_type(stop_status)); + hipLaunchKernelGGL(multidot_kernel, grid_size, block_size, 0, 0, k, + num_rows, hessenberg_iter->get_size()[1], + as_hip_type(k_krylov_bases), + as_hip_type(next_krylov_basis), stride_krylov, + as_hip_type(hessenberg_iter->get_values()), + stride_hessenberg, as_hip_type(stop_status)); } else { hipblas::dot(exec->get_hipblas_handle(), num_rows, k_krylov_bases, stride_krylov, next_krylov_basis, stride_krylov, @@ -180,8 +178,8 @@ void finish_arnoldi(std::shared_ptr exec, size_type num_rows, } hipLaunchKernelGGL( HIP_KERNEL_NAME(update_next_krylov_kernel), - dim3(ceildiv(num_rows * stride_krylov, default_block_size)), - dim3(default_block_size), 0, 0, k, num_rows, + ceildiv(num_rows * stride_krylov, default_block_size), + default_block_size, 0, 0, k, num_rows, hessenberg_iter->get_size()[1], as_hip_type(k_krylov_bases), as_hip_type(next_krylov_basis), stride_krylov, as_hip_type(hessenberg_iter->get_const_values()), stride_hessenberg, @@ -195,16 +193,16 @@ void finish_arnoldi(std::shared_ptr exec, size_type num_rows, hipLaunchKernelGGL( HIP_KERNEL_NAME(update_hessenberg_2_kernel), - dim3(hessenberg_iter->get_size()[1]), dim3(default_block_size), 0, 0, - iter, num_rows, hessenberg_iter->get_size()[1], + hessenberg_iter->get_size()[1], default_block_size, 0, 0, iter, + num_rows, hessenberg_iter->get_size()[1], as_hip_type(next_krylov_basis), stride_krylov, as_hip_type(hessenberg_iter->get_values()), stride_hessenberg, as_hip_type(stop_status)); hipLaunchKernelGGL( HIP_KERNEL_NAME(update_krylov_kernel), - dim3(ceildiv(num_rows * stride_krylov, default_block_size)), - dim3(default_block_size), 0, 0, iter, num_rows, + ceildiv(num_rows * stride_krylov, default_block_size), + default_block_size, 0, 0, iter, num_rows, hessenberg_iter->get_size()[1], as_hip_type(next_krylov_basis), stride_krylov, as_hip_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_hip_type(stop_status)); @@ -225,13 +223,13 @@ void givens_rotation(std::shared_ptr exec, // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_cols = hessenberg_iter->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{ - static_cast(ceildiv(num_cols, block_size)), 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_cols, block_size)); hipLaunchKernelGGL( - HIP_KERNEL_NAME(givens_rotation_kernel), dim3(grid_dim), - dim3(block_dim), 0, 0, hessenberg_iter->get_size()[0], + HIP_KERNEL_NAME(givens_rotation_kernel), grid_dim, + block_dim, 0, 0, hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], iter, as_hip_type(hessenberg_iter->get_values()), hessenberg_iter->get_stride(), as_hip_type(givens_sin->get_values()), @@ -256,10 +254,9 @@ void step_1(std::shared_ptr exec, size_type num_rows, { hipLaunchKernelGGL( increase_final_iteration_numbers_kernel, - dim3(static_cast( - ceildiv(final_iter_nums->get_num_elems(), default_block_size))), - dim3(default_block_size), 0, 0, - as_hip_type(final_iter_nums->get_data()), + static_cast( + ceildiv(final_iter_nums->get_num_elems(), default_block_size)), + default_block_size, 0, 0, as_hip_type(final_iter_nums->get_data()), as_hip_type(stop_status->get_const_data()), final_iter_nums->get_num_elems()); finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter, @@ -280,14 +277,14 @@ void solve_upper_triangular( // TODO: tune block_size for optimal performance constexpr auto block_size = default_block_size; const auto num_rhs = residual_norm_collection->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv(num_rhs, block_size)), - 1, 1}; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_rhs, block_size)); hipLaunchKernelGGL( - HIP_KERNEL_NAME(solve_upper_triangular_kernel), - dim3(grid_dim), dim3(block_dim), 0, 0, hessenberg->get_size()[1], - num_rhs, as_hip_type(residual_norm_collection->get_const_values()), + HIP_KERNEL_NAME(solve_upper_triangular_kernel), grid_dim, + block_dim, 0, 0, hessenberg->get_size()[1], num_rhs, + as_hip_type(residual_norm_collection->get_const_values()), residual_norm_collection->get_stride(), as_hip_type(hessenberg->get_const_values()), hessenberg->get_stride(), as_hip_type(y->get_values()), y->get_stride(), @@ -308,21 +305,19 @@ void calculate_qy(const matrix::Dense* krylov_bases, before_preconditioner->get_stride(); constexpr auto block_size = default_block_size; - const dim3 grid_dim{ - static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)), - 1, 1}; - const dim3 block_dim{block_size, 1, 1}; - - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(calculate_Qy_kernel), dim3(grid_dim), - dim3(block_dim), 0, 0, num_rows, num_cols, num_rhs, - as_hip_type(krylov_bases->get_const_values()), - krylov_bases->get_stride(), as_hip_type(y->get_const_values()), - y->get_stride(), as_hip_type(before_preconditioner->get_values()), - stride_before_preconditioner, - as_hip_type(final_iter_nums->get_const_data())); + const auto grid_dim = static_cast( + ceildiv(num_rows * stride_before_preconditioner, block_size)); + const auto block_dim = block_size; + + + hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_Qy_kernel), + grid_dim, block_dim, 0, 0, num_rows, num_cols, num_rhs, + as_hip_type(krylov_bases->get_const_values()), + krylov_bases->get_stride(), + as_hip_type(y->get_const_values()), y->get_stride(), + as_hip_type(before_preconditioner->get_values()), + stride_before_preconditioner, + as_hip_type(final_iter_nums->get_const_data())); // Calculate qy // before_preconditioner = krylov_bases * y } diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp index ff291d4a7f6..f70c4d7e38f 100644 --- a/hip/solver/multigrid_kernels.hip.cpp +++ b/hip/solver/multigrid_kernels.hip.cpp @@ -83,10 +83,10 @@ void kcycle_step_1(std::shared_ptr exec, constexpr int max_size = (1U << 31) - 1; const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; - const dim3 grid(ceildiv(grid_nrows * nrhs, default_block_size)); + const auto grid = ceildiv(grid_nrows * nrhs, default_block_size); hipLaunchKernelGGL( - kernel::kcycle_step_1_kernel, dim3(grid), dim3(default_block_size), 0, - 0, nrows, nrhs, e->get_stride(), grid_nrows, + kernel::kcycle_step_1_kernel, grid, default_block_size, 0, 0, nrows, + nrhs, e->get_stride(), grid_nrows, as_hip_type(alpha->get_const_values()), as_hip_type(rho->get_const_values()), as_hip_type(v->get_const_values()), as_hip_type(g->get_values()), @@ -111,16 +111,16 @@ void kcycle_step_2(std::shared_ptr exec, constexpr int max_size = (1U << 31) - 1; const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; - const dim3 grid(ceildiv(grid_nrows * nrhs, default_block_size)); - hipLaunchKernelGGL( - kernel::kcycle_step_2_kernel, dim3(grid), dim3(default_block_size), 0, - 0, nrows, nrhs, e->get_stride(), grid_nrows, - as_hip_type(alpha->get_const_values()), - as_hip_type(rho->get_const_values()), - as_hip_type(gamma->get_const_values()), - as_hip_type(beta->get_const_values()), - as_hip_type(zeta->get_const_values()), - as_hip_type(d->get_const_values()), as_hip_type(e->get_values())); + const auto grid = ceildiv(grid_nrows * nrhs, default_block_size); + hipLaunchKernelGGL(kernel::kcycle_step_2_kernel, grid, default_block_size, + 0, 0, nrows, nrhs, e->get_stride(), grid_nrows, + as_hip_type(alpha->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(gamma->get_const_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(zeta->get_const_values()), + as_hip_type(d->get_const_values()), + as_hip_type(e->get_values())); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); @@ -136,9 +136,9 @@ void kcycle_check_stop(std::shared_ptr exec, components::fill_array(exec, dis_stop.get_data(), dis_stop.get_num_elems(), true); const auto nrhs = new_norm->get_size()[1]; - const dim3 grid(ceildiv(nrhs, default_block_size)); - hipLaunchKernelGGL(kernel::kcycle_check_stop_kernel, dim3(grid), - dim3(default_block_size), 0, 0, nrhs, + const auto grid = ceildiv(nrhs, default_block_size); + hipLaunchKernelGGL(kernel::kcycle_check_stop_kernel, grid, + default_block_size, 0, 0, nrhs, as_hip_type(old_norm->get_const_values()), as_hip_type(new_norm->get_const_values()), rel_tol, as_hip_type(dis_stop.get_data())); diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp index 622e754a348..af2717a658d 100644 --- a/hip/stop/criterion_kernels.hip.cpp +++ b/hip/stop/criterion_kernels.hip.cpp @@ -71,13 +71,12 @@ __global__ __launch_bounds__(default_block_size) void set_all_statuses( void set_all_statuses(std::shared_ptr exec, uint8 stoppingId, bool setFinalized, Array* stop_status) { - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(stop_status->get_num_elems(), block_size.x), 1, - 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(stop_status->get_num_elems(), block_size); - hipLaunchKernelGGL((set_all_statuses), dim3(grid_size), dim3(block_size), 0, - 0, stop_status->get_num_elems(), stoppingId, - setFinalized, as_hip_type(stop_status->get_data())); + hipLaunchKernelGGL((set_all_statuses), grid_size, block_size, 0, 0, + stop_status->get_num_elems(), stoppingId, setFinalized, + as_hip_type(stop_status->get_data())); } diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index d74c9ab8a35..5925d7c6d9a 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -102,15 +102,15 @@ void residual_norm(std::shared_ptr exec, { static_assert(is_complex_s::value == false, "ValueType must not be complex in this function!"); - hipLaunchKernelGGL((init_kernel), dim3(1), dim3(1), 0, 0, + hipLaunchKernelGGL((init_kernel), 1, 1, 0, 0, as_hip_type(device_storage->get_data())); - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(tau->get_size()[1], block_size); - hipLaunchKernelGGL((residual_norm_kernel), dim3(grid_size), - dim3(block_size), 0, 0, tau->get_size()[1], - rel_residual_goal, as_hip_type(tau->get_const_values()), + hipLaunchKernelGGL((residual_norm_kernel), grid_size, block_size, 0, 0, + tau->get_size()[1], rel_residual_goal, + as_hip_type(tau->get_const_values()), as_hip_type(orig_tau->get_const_values()), stoppingId, setFinalized, as_hip_type(stop_status->get_data()), as_hip_type(device_storage->get_data())); @@ -180,15 +180,15 @@ void implicit_residual_norm( bool setFinalized, Array* stop_status, Array* device_storage, bool* all_converged, bool* one_changed) { - hipLaunchKernelGGL((init_kernel), dim3(1), dim3(1), 0, 0, + hipLaunchKernelGGL((init_kernel), 1, 1, 0, 0, as_hip_type(device_storage->get_data())); - const dim3 block_size(default_block_size, 1, 1); - const dim3 grid_size(ceildiv(tau->get_size()[1], block_size.x), 1, 1); + const auto block_size = default_block_size; + const auto grid_size = ceildiv(tau->get_size()[1], block_size); - hipLaunchKernelGGL((implicit_residual_norm_kernel), dim3(grid_size), - dim3(block_size), 0, 0, tau->get_size()[1], - rel_residual_goal, as_hip_type(tau->get_const_values()), + hipLaunchKernelGGL((implicit_residual_norm_kernel), grid_size, block_size, + 0, 0, tau->get_size()[1], rel_residual_goal, + as_hip_type(tau->get_const_values()), as_hip_type(orig_tau->get_const_values()), stoppingId, setFinalized, as_hip_type(stop_status->get_data()), as_hip_type(device_storage->get_data())); diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index 1b84f1fc448..693c73b2361 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -186,7 +186,7 @@ TEST_F(HipExecutor, CopiesDataToHip) hip->copy_from(omp.get(), 2, orig, copy); - hipLaunchKernelGGL((check_data), dim3(1), dim3(1), 0, 0, copy); + hipLaunchKernelGGL((check_data), 1, 1, 0, 0, copy); ASSERT_NO_THROW(hip->synchronize()); hip->free(copy); } @@ -235,7 +235,7 @@ TEST_F(HipExecutor, CopiesDataFromHip) { int copy[2]; auto orig = hip->alloc(2); - hipLaunchKernelGGL((init_data), dim3(1), dim3(1), 0, 0, orig); + hipLaunchKernelGGL((init_data), 1, 1, 0, 0, orig); omp->copy_from(hip.get(), 2, orig, copy); @@ -277,7 +277,7 @@ TEST_F(HipExecutor, CopiesDataFromHipToHip) int copy[2]; auto orig = hip->alloc(2); GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0)); - hipLaunchKernelGGL((init_data), dim3(1), dim3(1), 0, 0, orig); + hipLaunchKernelGGL((init_data), 1, 1, 0, 0, orig); auto copy_hip2 = hip2->alloc(2); hip2->copy_from(hip.get(), 2, orig, copy_hip2); @@ -285,7 +285,7 @@ TEST_F(HipExecutor, CopiesDataFromHipToHip) // Check that the data is really on GPU2 and ensure we did not cheat int value = -1; GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(hip2->get_device_id())); - hipLaunchKernelGGL((check_data), dim3(1), dim3(1), 0, 0, copy_hip2); + hipLaunchKernelGGL((check_data), 1, 1, 0, 0, copy_hip2); GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(0)); hip2->run(ExampleOperation(value)); ASSERT_EQ(value, hip2->get_device_id()); diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index ccbc8028f27..bfeb19bd08e 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -127,8 +127,8 @@ class IsFinite : public ::testing::Test { bool test_real_is_finite_kernel() { gko::Array result(hip, 1); - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_real_is_finite), dim3(1), - dim3(1), 0, 0, result.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_real_is_finite), 1, 1, 0, 0, + result.get_data()); result.set_executor(ref); return *result.get_data(); } @@ -137,8 +137,8 @@ class IsFinite : public ::testing::Test { bool test_complex_is_finite_kernel() { gko::Array result(hip, 1); - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_complex_is_finite), dim3(1), - dim3(1), 0, 0, result.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_complex_is_finite), 1, 1, 0, + 0, result.get_data()); result.set_executor(ref); return *result.get_data(); } diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp index a40357eb922..d1d46eab750 100644 --- a/hip/test/components/cooperative_groups.hip.cpp +++ b/hip/test/components/cooperative_groups.hip.cpp @@ -75,8 +75,8 @@ class CooperativeGroups : public ::testing::Test { template void test(Kernel kernel) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), - dim3(config::warp_size), 0, 0, dresult.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), 1, config::warp_size, 0, 0, + dresult.get_data()); result = dresult; auto success = *result.get_const_data(); @@ -86,9 +86,8 @@ class CooperativeGroups : public ::testing::Test { template void test_subwarp(Kernel kernel) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), - dim3(config::warp_size / 2), 0, 0, - dresult.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), 1, config::warp_size / 2, 0, + 0, dresult.get_data()); result = dresult; auto success = *result.get_const_data(); @@ -307,8 +306,8 @@ TEST_F(CooperativeGroups, ShuffleSumDouble) } dvalue = value; - hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_shuffle_sum), dim3(1), - dim3(config::warp_size), 0, 0, num, dvalue.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_shuffle_sum), 1, + config::warp_size, 0, 0, num, dvalue.get_data()); value = dvalue; GKO_ASSERT_ARRAY_EQ(value, answer); @@ -332,7 +331,7 @@ TEST_F(CooperativeGroups, ShuffleSumComplexDouble) dvalue = value; hipLaunchKernelGGL(HIP_KERNEL_NAME(cg_shuffle_sum>), - dim3(1), dim3(config::warp_size), 0, 0, num, + 1, config::warp_size, 0, 0, num, as_hip_type(dvalue.get_data())); value = dvalue; diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index 8c0a3cc707c..e75423697cd 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -168,10 +168,9 @@ TEST_F(Merging, MergeStep) { for (int i = 0; i < rng_runs; ++i) { init_data(i); - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), dim3(1), - dim3(config::warp_size), 0, 0, - ddata1.get_const_data(), ddata2.get_const_data(), - doutdata.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_step), 1, + config::warp_size, 0, 0, ddata1.get_const_data(), + ddata2.get_const_data(), doutdata.get_data()); assert_eq_ref(config::warp_size, config::warp_size); } @@ -197,10 +196,10 @@ TEST_F(Merging, FullMerge) for (int i = 0; i < rng_runs; ++i) { init_data(i); for (auto size : sizes) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), dim3(1), - dim3(config::warp_size), 0, 0, - ddata1.get_const_data(), ddata2.get_const_data(), - size, doutdata.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge), 1, + config::warp_size, 0, 0, ddata1.get_const_data(), + ddata2.get_const_data(), size, + doutdata.get_data()); assert_eq_ref(size, 2 * size); } @@ -224,8 +223,8 @@ TEST_F(Merging, SequentialFullMerge) for (int i = 0; i < rng_runs; ++i) { init_data(i); for (auto size : sizes) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), dim3(1), - dim3(1), 0, 0, ddata1.get_const_data(), + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sequential_merge), 1, 1, 0, + 0, ddata1.get_const_data(), ddata2.get_const_data(), size, doutdata.get_data()); @@ -270,13 +269,12 @@ TEST_F(Merging, FullMergeIdxs) for (int i = 0; i < rng_runs; ++i) { init_data(i); for (auto size : sizes) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_merge_idxs), dim3(1), - dim3(config::warp_size), 0, 0, - ddata1.get_const_data(), ddata2.get_const_data(), - size, doutdata.get_data(), didxs1.get_data(), - didxs2.get_data(), didxs3.get_data(), - drefidxs1.get_data(), drefidxs2.get_data(), - drefidxs3.get_data()); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(test_merge_idxs), 1, config::warp_size, 0, 0, + ddata1.get_const_data(), ddata2.get_const_data(), size, + doutdata.get_data(), didxs1.get_data(), didxs2.get_data(), + didxs3.get_data(), drefidxs1.get_data(), drefidxs2.get_data(), + drefidxs3.get_data()); assert_eq_ref(size, 2 * size); idxs1 = didxs1; diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 790245bbe56..10958feeb01 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -79,9 +79,9 @@ class Searching : public ::testing::Test { { *result.get_data() = true; dresult = result; - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(num_blocks), - dim3(config::warp_size), 0, 0, dresult.get_data(), - offset, size); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), num_blocks, + config::warp_size, 0, 0, dresult.get_data(), offset, + size); result = dresult; auto success = *result.get_const_data(); diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp index b7251694a1f..69bc0532954 100644 --- a/hip/test/components/sorting.hip.cpp +++ b/hip/test/components/sorting.hip.cpp @@ -120,8 +120,8 @@ class Sorting : public ::testing::Test { TEST_F(Sorting, HipBitonicSortWarp) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_warp), dim3(1), - dim3(config::warp_size), 0, 0, ddata.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_warp), 1, config::warp_size, 0, + 0, ddata.get_data()); ddata.set_executor(ref); auto data_ptr = ddata.get_const_data(); auto ref_ptr = ref_warp.get_const_data(); @@ -133,8 +133,8 @@ TEST_F(Sorting, HipBitonicSortWarp) TEST_F(Sorting, HipBitonicSortShared) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_shared), dim3(1), - dim3(num_threads), 0, 0, ddata.get_data()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sort_shared), 1, num_threads, 0, 0, + ddata.get_data()); ddata.set_executor(ref); auto data_ptr = ddata.get_const_data(); auto ref_ptr = ref_shared.get_const_data(); From f732c3cfd1543d50e338d5d216bcf63021ea9e44 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 10 Dec 2021 11:58:01 +0100 Subject: [PATCH 25/32] add conversions to Fbcsr --- core/device_hooks/common_kernels.inc.cpp | 4 ++ core/matrix/csr.cpp | 25 ++++++++ core/matrix/csr_kernels.hpp | 9 +++ core/matrix/dense.cpp | 54 ++++++++++++++++ core/matrix/dense_kernels.hpp | 15 +++++ cuda/matrix/csr_kernels.cu | 10 +++ cuda/matrix/dense_kernels.cu | 20 ++++++ dpcpp/matrix/csr_kernels.dp.cpp | 10 +++ dpcpp/matrix/dense_kernels.dp.cpp | 20 ++++++ hip/matrix/csr_kernels.hip.cpp | 10 +++ hip/matrix/dense_kernels.hip.cpp | 20 ++++++ include/ginkgo/core/matrix/csr.hpp | 5 ++ include/ginkgo/core/matrix/dense.hpp | 13 ++++ include/ginkgo/core/matrix/fbcsr.hpp | 1 + omp/matrix/csr_kernels.cpp | 70 ++++++++++++++++++++ omp/matrix/dense_kernels.cpp | 81 ++++++++++++++++++++++++ reference/matrix/csr_kernels.cpp | 70 ++++++++++++++++++++ reference/matrix/dense_kernels.cpp | 79 +++++++++++++++++++++++ 18 files changed, 516 insertions(+) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 9a072fa56d7..1ac4b0e4eb6 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -260,6 +260,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); @@ -268,6 +269,8 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); +GKO_STUB_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); @@ -471,6 +474,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index f6e3794ccf9..112922115ee 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -77,6 +78,7 @@ GKO_REGISTER_OPERATION(compute_slice_sets, sellp::compute_slice_sets); GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp); GKO_REGISTER_OPERATION(compute_max_row_nnz, ell::compute_max_row_nnz); GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell); +GKO_REGISTER_OPERATION(convert_to_fbcsr, csr::convert_to_fbcsr); GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, hybrid::compute_coo_row_ptrs); GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid); @@ -334,6 +336,29 @@ void Csr::move_to(Ell* result) } +template +void Csr::convert_to( + Fbcsr* result) const +{ + auto exec = this->get_executor(); + const auto bs = result->get_block_size(); + const auto row_blocks = detail::get_num_blocks(bs, this->get_size()[0]); + const auto col_blocks = detail::get_num_blocks(bs, this->get_size()[1]); + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(row_blocks + 1); + tmp->set_size(this->get_size()); + exec->run(csr::make_convert_to_fbcsr(this, bs, tmp->row_ptrs_, + tmp->col_idxs_, tmp->values_)); +} + + +template +void Csr::move_to(Fbcsr* result) +{ + this->convert_to(result); +} + + template void Csr::read(const mat_data& data) { diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 15bab04223a..227b7219d4c 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -107,6 +107,13 @@ namespace kernels { const matrix::Csr* source, \ matrix::Ell* result) +#define GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType) \ + void convert_to_fbcsr(std::shared_ptr exec, \ + const matrix::Csr* source, \ + int block_size, Array& row_ptrs, \ + Array& col_idxs, \ + Array& values) + #define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType) \ void convert_to_hybrid(std::shared_ptr exec, \ const matrix::Csr* source, \ @@ -222,6 +229,8 @@ namespace kernels { template \ GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 3f6fe48877c..93ec5578700 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -82,6 +83,8 @@ GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row); GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, hybrid::compute_coo_row_ptrs); GKO_REGISTER_OPERATION(count_nonzeros_per_row, dense::count_nonzeros_per_row); +GKO_REGISTER_OPERATION(count_nonzero_blocks_per_row, + dense::count_nonzero_blocks_per_row); GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum); GKO_REGISTER_OPERATION(compute_slice_sets, dense::compute_slice_sets); GKO_REGISTER_OPERATION(transpose, dense::transpose); @@ -96,6 +99,7 @@ GKO_REGISTER_OPERATION(fill_in_matrix_data, dense::fill_in_matrix_data); GKO_REGISTER_OPERATION(convert_to_coo, dense::convert_to_coo); GKO_REGISTER_OPERATION(convert_to_csr, dense::convert_to_csr); GKO_REGISTER_OPERATION(convert_to_ell, dense::convert_to_ell); +GKO_REGISTER_OPERATION(convert_to_fbcsr, dense::convert_to_fbcsr); GKO_REGISTER_OPERATION(convert_to_hybrid, dense::convert_to_hybrid); GKO_REGISTER_OPERATION(convert_to_sellp, dense::convert_to_sellp); GKO_REGISTER_OPERATION(convert_to_sparsity_csr, dense::convert_to_sparsity_csr); @@ -456,6 +460,56 @@ void Dense::move_to(Csr* result) } +template +template +void Dense::convert_impl(Fbcsr* result) const +{ + auto exec = this->get_executor(); + const auto bs = result->get_block_size(); + const auto row_blocks = detail::get_num_blocks(bs, this->get_size()[0]); + const auto col_blocks = detail::get_num_blocks(bs, this->get_size()[1]); + auto tmp = make_temporary_clone(exec, result); + tmp->row_ptrs_.resize_and_reset(row_blocks + 1); + exec->run(dense::make_count_nonzero_blocks_per_row(this, bs, + tmp->get_row_ptrs())); + exec->run(dense::make_prefix_sum(tmp->get_row_ptrs(), row_blocks + 1)); + const auto nnz_blocks = + exec->copy_val_to_host(tmp->get_const_row_ptrs() + row_blocks); + tmp->col_idxs_.resize_and_reset(nnz_blocks); + tmp->values_.resize_and_reset(nnz_blocks * bs * bs); + tmp->set_size(this->get_size()); + exec->run(dense::make_convert_to_fbcsr(this, tmp.get())); +} + + +template +void Dense::convert_to(Fbcsr* result) const +{ + this->convert_impl(result); +} + + +template +void Dense::move_to(Fbcsr* result) +{ + this->convert_to(result); +} + + +template +void Dense::convert_to(Fbcsr* result) const +{ + this->convert_impl(result); +} + + +template +void Dense::move_to(Fbcsr* result) +{ + this->convert_to(result); +} + + template template void Dense::convert_impl(Ell* result) const diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 6b37b44dc71..04913c6e72c 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -148,6 +148,11 @@ namespace kernels { const matrix::Dense<_type>* source, \ matrix::Ell<_type, _prec>* other) +#define GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL(_type, _prec) \ + void convert_to_fbcsr(std::shared_ptr exec, \ + const matrix::Dense<_type>* source, \ + matrix::Fbcsr<_type, _prec>* other) + #define GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(_type, _prec) \ void convert_to_hybrid(std::shared_ptr exec, \ const matrix::Dense<_type>* source, \ @@ -180,6 +185,11 @@ namespace kernels { const matrix::Dense<_vtype>* source, \ _itype* result) +#define GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL(_vtype, _itype) \ + void count_nonzero_blocks_per_row( \ + std::shared_ptr exec, \ + const matrix::Dense<_vtype>* source, int block_size, _itype* result) + #define GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T(_type) \ GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(_type, ::gko::size_type) @@ -298,6 +308,8 @@ namespace kernels { template \ GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ @@ -309,6 +321,9 @@ namespace kernels { GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL(ValueType, \ + IndexType); \ template \ GKO_DECLARE_DENSE_TRANSPOSE_KERNEL(ValueType); \ template \ diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index e04cac7f51e..0984e934625 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -899,6 +899,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& row_ptrs, Array& col_idxs, + Array& values) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + template void transpose(std::shared_ptr exec, const matrix::Csr* orig, diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 01c9d54c03f..c0d6e89ba79 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -215,6 +215,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Fbcsr* result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); + + +template +void count_nonzero_blocks_per_row(std::shared_ptr exec, + const matrix::Dense* source, + int bs, + IndexType* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); + + template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 5de9c242217..7f691fa1bfa 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1955,6 +1955,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& row_ptrs, Array& col_idxs, + Array& values) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + template void generic_transpose(std::shared_ptr exec, const matrix::Csr* orig, diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index efc750d1424..50477c326aa 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -638,6 +638,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Fbcsr* result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); + + +template +void count_nonzero_blocks_per_row(std::shared_ptr exec, + const matrix::Dense* source, + int bs, + IndexType* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); + + template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index be63a3f2ff1..cf1f7eea2a7 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -721,6 +721,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& row_ptrs, Array& col_idxs, + Array& values) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + template void transpose(std::shared_ptr exec, const matrix::Csr* orig, diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 6e7dd34bfbc..7386b38f692 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -220,6 +220,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Fbcsr* result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); + + +template +void count_nonzero_blocks_per_row(std::shared_ptr exec, + const matrix::Dense* source, + int bs, + IndexType* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); + + template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 40bd0ec418a..1f60334309c 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -129,6 +129,7 @@ class Csr : public EnableLinOp>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, + public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -734,6 +735,10 @@ class Csr : public EnableLinOp>, void move_to(Ell* result) override; + void convert_to(Fbcsr* result) const override; + + void move_to(Fbcsr* result) override; + void convert_to(Hybrid* result) const override; void move_to(Hybrid* result) override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index c36e62b9a87..d9545099d84 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -102,6 +102,8 @@ class Dense public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -237,6 +239,14 @@ class Dense void move_to(Ell* result) override; + void convert_to(Fbcsr* result) const override; + + void move_to(Fbcsr* result) override; + + void convert_to(Fbcsr* result) const override; + + void move_to(Fbcsr* result) override; + void convert_to(Hybrid* result) const override; void move_to(Hybrid* result) override; @@ -939,6 +949,9 @@ class Dense template void convert_impl(Ell* result) const; + template + void convert_impl(Fbcsr* result) const; + template void convert_impl(Hybrid* result) const; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index 47d8c5039c9..a04e7462fa1 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -138,6 +138,7 @@ class Fbcsr : public EnableLinOp>, remove_complex>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; + friend class Csr; friend class Dense; friend class SparsityCsr; friend class FbcsrBuilder; diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 6d31fae06e2..5ba21041ae4 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -577,6 +577,76 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& row_ptrs, Array& col_idxs, + Array& values) +{ + using entry = matrix_data_entry; + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto num_block_rows = num_rows / bs; + const auto num_block_cols = num_cols / bs; + const auto in_row_ptrs = source->get_const_row_ptrs(); + const auto in_cols = source->get_const_col_idxs(); + const auto in_vals = source->get_const_values(); + const auto nnz = source->get_num_stored_elements(); + auto out_row_ptrs = row_ptrs.get_data(); + Array entry_array{exec, nnz}; + auto entries = entry_array.get_data(); + for (IndexType row = 0; row < num_rows; row++) { + for (auto nz = in_row_ptrs[row]; nz < in_row_ptrs[row + 1]; nz++) { + entries[nz] = {row, in_cols[nz], in_vals[nz]}; + } + } + auto to_block = [bs](entry a) { + return std::make_pair(a.row / bs, a.column / bs); + }; + // sort by block in row-major order + std::sort(entries, entries + nnz, + [&](entry a, entry b) { return to_block(a) < to_block(b); }); + // set row pointers by jumps in block row index + gko::vector col_idx_vec{{exec}}; + gko::vector value_vec{{exec}}; + int64 block_row = -1; + int64 block_col = -1; + for (size_type i = 0; i < nnz; i++) { + const auto entry = entries[i]; + const auto new_block_row = entry.row / bs; + const auto new_block_col = entry.column / bs; + while (new_block_row > block_row) { + // we finished row block_row, so store its end pointer + out_row_ptrs[block_row + 1] = col_idx_vec.size(); + block_col = -1; + ++block_row; + } + if (new_block_col != block_col) { + // we encountered a new column, so insert it with block storage + col_idx_vec.emplace_back(new_block_col); + value_vec.resize(value_vec.size() + bs * bs); + block_col = new_block_col; + } + const auto local_row = entry.row % bs; + const auto local_col = entry.column % bs; + value_vec[value_vec.size() - bs * bs + local_row + local_col * bs] = + entry.value; + } + while (block_row < static_cast(row_ptrs.get_num_elems() - 1)) { + // we finished row block_row, so store its end pointer + out_row_ptrs[block_row + 1] = col_idx_vec.size(); + ++block_row; + } + values.resize_and_reset(value_vec.size()); + col_idxs.resize_and_reset(col_idx_vec.size()); + std::copy(value_vec.begin(), value_vec.end(), values.get_data()); + std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + template inline void convert_csr_to_csc(size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 431d610b7ca..360f05448ee 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -46,11 +46,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include +#include "accessor/block_col_major.hpp" +#include "accessor/range.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -224,6 +227,53 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Fbcsr* result) +{ + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto bs = result->get_block_size(); + const auto nzbs = result->get_num_stored_blocks(); + const auto num_block_rows = num_rows / bs; + const auto num_block_cols = num_cols / bs; + acc::range> blocks( + std::array{nzbs, static_cast(bs), + static_cast(bs)}, + result->get_values()); + auto col_idxs = result->get_col_idxs(); +#pragma omp parallel for + for (size_type brow = 0; brow < num_block_rows; ++brow) { + auto block = result->get_const_row_ptrs()[brow]; + for (size_type bcol = 0; bcol < num_block_cols; ++bcol) { + bool block_nz = false; + for (int lrow = 0; lrow < bs; ++lrow) { + for (int lcol = 0; lcol < bs; ++lcol) { + const auto row = lrow + bs * brow; + const auto col = lcol + bs * bcol; + block_nz = block_nz || is_nonzero(source->at(row, col)); + } + } + if (block_nz) { + col_idxs[block] = bcol; + for (int lrow = 0; lrow < bs; ++lrow) { + for (int lcol = 0; lcol < bs; ++lcol) { + const auto row = lrow + bs * brow; + const auto col = lcol + bs * bcol; + blocks(block, lrow, lcol) = source->at(row, col); + } + } + block++; + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); + + template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, @@ -375,6 +425,37 @@ void conj_transpose(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); +template +void count_nonzero_blocks_per_row(std::shared_ptr exec, + const matrix::Dense* source, + int bs, IndexType* result) +{ + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto num_block_rows = num_rows / bs; + const auto num_block_cols = num_cols / bs; +#pragma omp parallel for + for (size_type brow = 0; brow < num_block_rows; ++brow) { + IndexType num_nonzero_blocks{}; + for (size_type bcol = 0; bcol < num_block_cols; ++bcol) { + bool block_nz = false; + for (int lrow = 0; lrow < bs; ++lrow) { + for (int lcol = 0; lcol < bs; ++lcol) { + const auto row = lrow + bs * brow; + const auto col = lcol + bs * bcol; + block_nz = block_nz || is_nonzero(source->at(row, col)); + } + } + num_nonzero_blocks += block_nz ? 1 : 0; + } + result[brow] = num_nonzero_blocks; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); + + } // namespace dense } // namespace omp } // namespace kernels diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 3b72aa336af..ba96df7fe64 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -475,6 +475,76 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& row_ptrs, Array& col_idxs, + Array& values) +{ + using entry = matrix_data_entry; + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto num_block_rows = num_rows / bs; + const auto num_block_cols = num_cols / bs; + const auto in_row_ptrs = source->get_const_row_ptrs(); + const auto in_cols = source->get_const_col_idxs(); + const auto in_vals = source->get_const_values(); + const auto nnz = source->get_num_stored_elements(); + auto out_row_ptrs = row_ptrs.get_data(); + Array entry_array{exec, nnz}; + auto entries = entry_array.get_data(); + for (IndexType row = 0; row < num_rows; row++) { + for (auto nz = in_row_ptrs[row]; nz < in_row_ptrs[row + 1]; nz++) { + entries[nz] = {row, in_cols[nz], in_vals[nz]}; + } + } + auto to_block = [bs](entry a) { + return std::make_pair(a.row / bs, a.column / bs); + }; + // sort by block in row-major order + std::sort(entries, entries + nnz, + [&](entry a, entry b) { return to_block(a) < to_block(b); }); + // set row pointers by jumps in block row index + gko::vector col_idx_vec{{exec}}; + gko::vector value_vec{{exec}}; + int64 block_row = -1; + int64 block_col = -1; + for (size_type i = 0; i < nnz; i++) { + const auto entry = entries[i]; + const auto new_block_row = entry.row / bs; + const auto new_block_col = entry.column / bs; + while (new_block_row > block_row) { + // we finished row block_row, so store its end pointer + out_row_ptrs[block_row + 1] = col_idx_vec.size(); + block_col = -1; + ++block_row; + } + if (new_block_col != block_col) { + // we encountered a new column, so insert it with block storage + col_idx_vec.emplace_back(new_block_col); + value_vec.resize(value_vec.size() + bs * bs); + block_col = new_block_col; + } + const auto local_row = entry.row % bs; + const auto local_col = entry.column % bs; + value_vec[value_vec.size() - bs * bs + local_row + local_col * bs] = + entry.value; + } + while (block_row < static_cast(row_ptrs.get_num_elems() - 1)) { + // we finished row block_row, so store its end pointer + out_row_ptrs[block_row + 1] = col_idx_vec.size(); + ++block_row; + } + values.resize_and_reset(value_vec.size()); + col_idxs.resize_and_reset(col_idx_vec.size()); + std::copy(value_vec.begin(), value_vec.end(), values.get_data()); + std::copy(col_idx_vec.begin(), col_idx_vec.end(), col_idxs.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + template inline void convert_csr_to_csc(size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 804237e4748..cd19aa5299e 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -43,11 +43,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include +#include "accessor/block_col_major.hpp" +#include "accessor/range.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -462,6 +465,52 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Fbcsr* result) +{ + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto bs = result->get_block_size(); + const auto nzbs = result->get_num_stored_blocks(); + const auto num_block_rows = num_rows / bs; + const auto num_block_cols = num_cols / bs; + acc::range> blocks( + std::array{nzbs, static_cast(bs), + static_cast(bs)}, + result->get_values()); + auto col_idxs = result->get_col_idxs(); + for (size_type brow = 0; brow < num_block_rows; ++brow) { + auto block = result->get_const_row_ptrs()[brow]; + for (size_type bcol = 0; bcol < num_block_cols; ++bcol) { + bool block_nz = false; + for (int lrow = 0; lrow < bs; ++lrow) { + for (int lcol = 0; lcol < bs; ++lcol) { + const auto row = lrow + bs * brow; + const auto col = lcol + bs * bcol; + block_nz = block_nz || is_nonzero(source->at(row, col)); + } + } + if (block_nz) { + col_idxs[block] = bcol; + for (int lrow = 0; lrow < bs; ++lrow) { + for (int lcol = 0; lcol < bs; ++lcol) { + const auto row = lrow + bs * brow; + const auto col = lcol + bs * bcol; + blocks(block, lrow, lcol) = source->at(row, col); + } + } + block++; + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); + + template void convert_to_hybrid(std::shared_ptr exec, const matrix::Dense* source, const int64*, @@ -658,6 +707,36 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); +template +void count_nonzero_blocks_per_row(std::shared_ptr exec, + const matrix::Dense* source, + int bs, IndexType* result) +{ + const auto num_rows = source->get_size()[0]; + const auto num_cols = source->get_size()[1]; + const auto num_block_rows = num_rows / bs; + const auto num_block_cols = num_cols / bs; + for (size_type brow = 0; brow < num_block_rows; ++brow) { + IndexType num_nonzero_blocks{}; + for (size_type bcol = 0; bcol < num_block_cols; ++bcol) { + bool block_nz = false; + for (int lrow = 0; lrow < bs; ++lrow) { + for (int lcol = 0; lcol < bs; ++lcol) { + const auto row = lrow + bs * brow; + const auto col = lcol + bs * bcol; + block_nz = block_nz || is_nonzero(source->at(row, col)); + } + } + num_nonzero_blocks += block_nz ? 1 : 0; + } + result[brow] = num_nonzero_blocks; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); + + template void transpose(std::shared_ptr exec, const matrix::Dense* orig, From f0a683df1da3c64da1c226fc715d9112f38f9dce Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 24 Jan 2022 03:51:05 +0100 Subject: [PATCH 26/32] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove unnecessary kernel already present in format_conversion * fix include structure * fix Hybrid::resize * remove unnecessary kernel launch guards * add HIP kernel launch guards * simplify 2D kernel launch guards * add tests for format conversion components Co-authored-by: Yuhsiang Tsai Co-authored-by: Pratik Nayak Co-authored-by: Thomas Grützmacher --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 3 +- .../components/format_conversion_kernels.cpp | 7 +- common/unified/matrix/csr_kernels.cpp | 17 -- core/components/format_conversion_kernels.hpp | 12 +- core/device_hooks/common_kernels.inc.cpp | 8 +- core/matrix/csr.cpp | 12 +- core/matrix/csr_kernels.hpp | 7 - core/matrix/hybrid.cpp | 1 + cuda/base/kernel_launch.cuh | 2 +- cuda/base/kernel_launch_reduction.cuh | 94 ++++------ cuda/base/kernel_launch_solver.cuh | 2 +- cuda/components/reduction.cuh | 7 +- cuda/factorization/factorization_kernels.cu | 29 +-- cuda/matrix/csr_kernels.cu | 21 +-- cuda/matrix/dense_kernels.cu | 4 +- cuda/matrix/ell_kernels.cu | 4 +- cuda/matrix/fbcsr_kernels.cu | 9 +- cuda/matrix/sellp_kernels.cu | 18 +- cuda/preconditioner/jacobi_kernels.cu | 1 + cuda/test/matrix/csr_kernels.cpp | 15 -- cuda/test/matrix/ell_kernels.cpp | 1 - dpcpp/test/matrix/csr_kernels.cpp | 15 -- dpcpp/test/matrix/ell_kernels.cpp | 10 +- hip/base/kernel_launch.hip.hpp | 2 +- hip/base/kernel_launch_reduction.hip.hpp | 27 ++- hip/base/kernel_launch_solver.hip.hpp | 2 +- .../factorization_kernels.hip.cpp | 71 ++++--- hip/factorization/par_ic_kernels.hip.cpp | 26 ++- hip/factorization/par_ict_kernels.hip.cpp | 40 ++-- hip/factorization/par_ilu_kernels.hip.cpp | 22 ++- .../par_ilut_approx_filter_kernel.hip.cpp | 21 ++- .../par_ilut_filter_kernel.hip.cpp | 22 ++- .../par_ilut_select_common.hip.cpp | 8 +- .../par_ilut_select_kernel.hip.cpp | 9 +- .../par_ilut_spgeam_kernel.hip.cpp | 32 ++-- .../par_ilut_sweep_kernel.hip.cpp | 24 +-- hip/matrix/coo_kernels.hip.cpp | 4 +- hip/matrix/csr_kernels.hip.cpp | 175 +++++++++++------- hip/matrix/dense_kernels.hip.cpp | 4 +- hip/matrix/diagonal_kernels.hip.cpp | 8 +- hip/matrix/ell_kernels.hip.cpp | 35 ++-- hip/matrix/fbcsr_kernels.hip.cpp | 15 +- hip/matrix/sellp_kernels.hip.cpp | 48 ++--- hip/multigrid/amgx_pgm_kernels.hip.cpp | 78 ++++---- hip/preconditioner/isai_kernels.hip.cpp | 102 +++++----- ...obi_advanced_apply_instantiate.inc.hip.cpp | 33 ++-- .../jacobi_generate_instantiate.inc.hip.cpp | 39 ++-- hip/preconditioner/jacobi_kernels.hip.cpp | 53 +++--- ...acobi_simple_apply_instantiate.inc.hip.cpp | 31 ++-- hip/solver/multigrid_kernels.hip.cpp | 48 ++--- hip/stop/criterion_kernels.hip.cpp | 8 +- hip/stop/residual_norm_kernels.hip.cpp | 30 +-- hip/test/matrix/csr_kernels.hip.cpp | 15 -- include/ginkgo/core/matrix/coo.hpp | 7 + include/ginkgo/core/matrix/dense.hpp | 2 + include/ginkgo/core/matrix/ell.hpp | 9 + include/ginkgo/core/matrix/hybrid.hpp | 10 + omp/test/matrix/csr_kernels.cpp | 15 -- .../components/format_conversion_kernels.cpp | 7 +- reference/matrix/csr_kernels.cpp | 15 -- reference/test/components/CMakeLists.txt | 1 + .../components/format_conversion_kernels.cpp | 128 +++++++++++++ reference/test/matrix/csr_kernels.cpp | 13 -- test/components/CMakeLists.txt | 1 + test/components/format_conversion_kernels.cpp | 168 +++++++++++++++++ 65 files changed, 1022 insertions(+), 685 deletions(-) create mode 100644 reference/test/components/format_conversion_kernels.cpp create mode 100644 test/components/format_conversion_kernels.cpp diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 5f14ee9ea0c..abead3d4107 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -795,8 +795,7 @@ template __global__ __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( const size_type num_rows, const size_type num_cols, - const size_type num_nnz, const size_type row_offset, - const size_type col_offset, + const size_type row_offset, const size_type col_offset, const IndexType* __restrict__ source_row_ptrs, const IndexType* __restrict__ source_col_idxs, const ValueType* __restrict__ source_values, diff --git a/common/unified/components/format_conversion_kernels.cpp b/common/unified/components/format_conversion_kernels.cpp index 75d5837d34f..8ce0d595f16 100644 --- a/common/unified/components/format_conversion_kernels.cpp +++ b/common/unified/components/format_conversion_kernels.cpp @@ -96,10 +96,10 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS32); GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS64); -template +template void convert_ptrs_to_sizes(std::shared_ptr exec, const RowPtrType* ptrs, size_type num_blocks, - IndexType* sizes) + size_type* sizes) { run_kernel( exec, @@ -109,8 +109,7 @@ void convert_ptrs_to_sizes(std::shared_ptr exec, num_blocks, ptrs, sizes); } -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES32); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES64); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES); } // namespace components diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index 5b571dea4ae..87acfec1663 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -152,23 +152,6 @@ void inv_scale(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SCALE_KERNEL); -template -void count_nonzeros_per_row(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto row_ptrs, auto nnz) { - nnz[row] = row_ptrs[row + 1] - row_ptrs[row]; - }, - source->get_size()[0], source->get_const_row_ptrs(), result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL); - - template void convert_to_sellp(std::shared_ptr exec, const matrix::Csr* matrix, diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp index 8cf17dbe5e1..193dbc33060 100644 --- a/core/components/format_conversion_kernels.hpp +++ b/core/components/format_conversion_kernels.hpp @@ -67,14 +67,10 @@ namespace kernels { #define GKO_DECLARE_CONVERT_IDXS_TO_PTRS64(IndexType) \ GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, ::gko::int64) -#define GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, RowPtrType) \ +#define GKO_DECLARE_CONVERT_PTRS_TO_SIZES(RowPtrType) \ void convert_ptrs_to_sizes(std::shared_ptr exec, \ const RowPtrType* ptrs, size_type num_blocks, \ - IndexType* sizes) -#define GKO_DECLARE_CONVERT_PTRS_TO_SIZES32(IndexType) \ - GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, ::gko::int32) -#define GKO_DECLARE_CONVERT_PTRS_TO_SIZES64(IndexType) \ - GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, ::gko::int64) + size_type* sizes) #define GKO_DECLARE_ALL_AS_TEMPLATES \ @@ -82,8 +78,8 @@ namespace kernels { GKO_DECLARE_CONVERT_PTRS_TO_IDXS(IndexType, RowPtrType); \ template \ GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, RowPtrType); \ - template \ - GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, RowPtrType) + template \ + GKO_DECLARE_CONVERT_PTRS_TO_SIZES(RowPtrType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(components, diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 1ac4b0e4eb6..647c0c7ed12 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -202,11 +202,10 @@ GKO_DECLARE_CONVERT_IDXS_TO_PTRS(IndexType, RowPtrType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS32); GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS64); -template -GKO_DECLARE_CONVERT_PTRS_TO_SIZES(IndexType, RowPtrType) +template +GKO_DECLARE_CONVERT_PTRS_TO_SIZES(RowPtrType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES32); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES64); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES); } // namespace components @@ -484,7 +483,6 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); GKO_STUB_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 112922115ee..ed75ed45287 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -92,7 +92,8 @@ GKO_REGISTER_OPERATION(row_permute, csr::row_permute); GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute); GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute); GKO_REGISTER_OPERATION(invert_permutation, csr::invert_permutation); -GKO_REGISTER_OPERATION(count_nonzeros_per_row, csr::count_nonzeros_per_row); +GKO_REGISTER_OPERATION(convert_ptrs_to_sizes, + components::convert_ptrs_to_sizes); GKO_REGISTER_OPERATION(sort_by_column_index, csr::sort_by_column_index); GKO_REGISTER_OPERATION(is_sorted_by_column_index, csr::is_sorted_by_column_index); @@ -231,16 +232,15 @@ void Csr::convert_to( auto exec = this->get_executor(); Array row_nnz{exec, this->get_size()[0]}; Array coo_row_ptrs{exec, this->get_size()[0] + 1}; - exec->run(csr::make_count_nonzeros_per_row(this, row_nnz.get_data())); + exec->run(csr::make_convert_ptrs_to_sizes( + this->get_const_row_ptrs(), this->get_size()[0], row_nnz.get_data())); size_type ell_lim{}; size_type coo_nnz{}; result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz); exec->run(csr::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim, coo_row_ptrs.get_data())); auto tmp = make_temporary_clone(exec, result); - tmp->ell_->resize(this->get_size(), ell_lim); - tmp->coo_->resize(this->get_size(), coo_nnz); - tmp->set_size(this->get_size()); + tmp->resize(this->get_size(), ell_lim, coo_nnz); exec->run(csr::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(), tmp.get())); } @@ -294,7 +294,7 @@ void Csr::convert_to( result->row_ptrs_ = this->row_ptrs_; if (!result->value_.get_data()) { result->value_ = - gko::Array(result->get_executor(), {one()}); + Array(result->get_executor(), {one()}); } result->set_size(this->get_size()); } diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 227b7219d4c..95c3c7c9f0c 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -165,11 +165,6 @@ namespace kernels { std::shared_ptr exec, size_type size, \ const IndexType* permutation_indices, IndexType* inv_permutation) -#define GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType) \ - void count_nonzeros_per_row( \ - std::shared_ptr exec, \ - const matrix::Csr* source, size_type* result) - #define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType) \ void calculate_nonzeros_per_row_in_span( \ std::shared_ptr exec, \ @@ -245,8 +240,6 @@ namespace kernels { template \ GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType); \ template \ - GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ - template \ GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType); \ diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index c4152493d3b..42cdce0f162 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -199,6 +199,7 @@ void Hybrid::resize(dim<2> new_size, size_type ell_row_nnz, size_type coo_nnz) { + this->set_size(new_size); ell_->resize(new_size, ell_row_nnz); coo_->resize(new_size, coo_nnz); } diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index c82d024791b..67f80f78a2f 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -90,7 +90,7 @@ template void run_kernel(std::shared_ptr exec, KernelFunction fn, dim<2> size, KernelArgs&&... args) { - if (size[0] * size[1] > 0) { + if (size[0] > 0 && size[1] > 0) { constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); generic_kernel_2d<<>>( diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 9ecdfcceb7a..db7040b07f5 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -148,12 +148,10 @@ void run_kernel_reduction(std::shared_ptr exec, ceildiv(size, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; - if (num_blocks > 0) { - generic_kernel_reduction_1d<<>>( - static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), map_to_device(args)...); - } + generic_kernel_reduction_1d<<>>( + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, @@ -183,12 +181,10 @@ void run_kernel_reduction(std::shared_ptr exec, exec->get_num_warps() * oversubscription); if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; - if (num_blocks > 0) { - generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(partial.get_data()), - map_to_device(args)...); - } + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(partial.get_data()), + map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, @@ -406,13 +402,11 @@ void run_generic_col_reduction_small(syn::value_list, } else { Array tmp_storage{exec, static_cast(num_blocks * cols)}; - if (num_blocks > 0) { - generic_kernel_col_reduction_2d_small - <<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), - as_cuda_type(tmp_storage.get_data()), args...); - } + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), + args...); if (cols > 0) { generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( @@ -451,22 +445,18 @@ void run_kernel_row_reduction(std::shared_ptr exec, static_cast(col_blocks * rows)}; const auto num_blocks = ceildiv(rows * col_blocks * config::warp_size, default_block_size); - if (num_blocks > 0) { - generic_kernel_row_reduction_2d - <<>>( - rows, cols, col_blocks, fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), 1, - map_to_device(args)...); - } + // no need to guard this kernel, as rows * cols > resources + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), 1, map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, default_block_size); - if (num_finalize_blocks > 0) { - generic_kernel_reduction_finalize_2d<<>>( - rows, col_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(partial.get_const_data()), - static_cast(result_stride), as_cuda_type(result)); - } + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(partial.get_const_data()), + static_cast(result_stride), as_cuda_type(result)); } else { select_run_generic_kernel_row_reduction( subwarp_sizes(), @@ -513,30 +503,24 @@ void run_kernel_col_reduction(std::shared_ptr exec, max_blocks), col_blocks); if (row_blocks <= 1) { - if (col_blocks > 0) { - generic_kernel_col_reduction_2d_blocked<<>>( - rows, cols, fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); - } + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); } else { Array tmp_storage{ exec, static_cast(row_blocks * cols)}; - if (row_blocks * col_blocks > 0) { - generic_kernel_col_reduction_2d_blocked<<< - dim3(row_blocks, col_blocks), default_block_size>>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), - as_cuda_type(tmp_storage.get_data()), - map_to_device(args)...); - } - if (cols > 0) { - generic_kernel_reduction_finalize_2d<<< - ceildiv(cols, default_block_size), default_block_size>>>( - cols, row_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(tmp_storage.get_const_data()), 1, - as_cuda_type(result)); - } + // no need to guard this kernel, as cols > warp_size, row_blocks > 1 + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), + map_to_device(args)...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); } } } diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh index 062e8091081..38dc0afffff 100644 --- a/cuda/base/kernel_launch_solver.cuh +++ b/cuda/base/kernel_launch_solver.cuh @@ -62,7 +62,7 @@ void run_kernel_solver(std::shared_ptr exec, KernelFunction fn, dim<2> size, size_type default_stride, KernelArgs&&... args) { - if (size[0] * size[1] > 0) { + if (size[0] > 0 && size[1] > 0) { constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); generic_kernel_2d_solver<<>>( diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index 892c84368d7..4d73fb736b0 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -82,11 +82,8 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, block_results.resize_and_reset(grid_dim); - if (grid_dim > 0) { - reduce_add_array<<>>( - size, as_cuda_type(source), - as_cuda_type(block_results.get_data())); - } + reduce_add_array<<>>( + size, as_cuda_type(source), as_cuda_type(block_results.get_data())); block_results_val = block_results.get_const_data(); } diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu index 202da06749f..3ad3435e775 100644 --- a/cuda/factorization/factorization_kernels.cu +++ b/cuda/factorization/factorization_kernels.cu @@ -75,6 +75,9 @@ void add_diagonal_elements(std::shared_ptr exec, auto num_rows = static_cast(mtx_size[0]); auto num_cols = static_cast(mtx_size[1]); size_type row_ptrs_size = num_rows + 1; + if (num_rows == 0) { + return; + } Array row_ptrs_addition(exec, row_ptrs_size); Array needs_change_host{exec->get_master(), 1}; @@ -90,20 +93,18 @@ void add_diagonal_elements(std::shared_ptr exec, const auto block_dim = default_block_size; const auto grid_dim = static_cast(ceildiv(num_rows, block_dim / subwarp_size)); - if (num_rows > 0) { - if (is_sorted) { - kernel::find_missing_diagonal_elements - <<>>( - num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, - cuda_row_ptrs_add, - as_cuda_type(needs_change_device.get_data())); - } else { - kernel::find_missing_diagonal_elements - <<>>( - num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, - cuda_row_ptrs_add, - as_cuda_type(needs_change_device.get_data())); - } + if (is_sorted) { + kernel::find_missing_diagonal_elements + <<>>( + num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, + cuda_row_ptrs_add, + as_cuda_type(needs_change_device.get_data())); + } else { + kernel::find_missing_diagonal_elements + <<>>( + num_rows, num_cols, cuda_old_col_idxs, cuda_old_row_ptrs, + cuda_row_ptrs_add, + as_cuda_type(needs_change_device.get_data())); } needs_change_host = needs_change_device; if (!needs_change_host.get_const_data()[0]) { diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 0984e934625..2550aa2ceb3 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -230,7 +230,7 @@ void classical_spmv(syn::value_list, const auto block = spmv_block_size; if (alpha == nullptr && beta == nullptr) { - if (grid.x * grid.y > 0) { + if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<>>( a->get_size()[0], as_cuda_type(a->get_const_values()), @@ -240,7 +240,7 @@ void classical_spmv(syn::value_list, as_cuda_type(c->get_values()), c->get_stride()); } } else if (alpha != nullptr && beta != nullptr) { - if (grid.x * grid.y > 0) { + if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<>>( a->get_size()[0], as_cuda_type(alpha->get_const_values()), @@ -277,7 +277,7 @@ void load_balance_spmv(std::shared_ptr exec, const dim3 csr_block(config::warp_size, warps_in_block, 1); const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); if (alpha) { - if (csr_grid.x * csr_grid.y > 0) { + if (csr_grid.x > 0 && csr_grid.y > 0) { kernel::abstract_spmv<<>>( nwarps, static_cast(a->get_size()[0]), as_cuda_type(alpha->get_const_values()), @@ -291,7 +291,7 @@ void load_balance_spmv(std::shared_ptr exec, as_cuda_type(c->get_stride())); } } else { - if (csr_grid.x * csr_grid.y > 0) { + if (csr_grid.x > 0 && csr_grid.y > 0) { kernel::abstract_spmv<<>>( nwarps, static_cast(a->get_size()[0]), as_cuda_type(a->get_const_values()), @@ -357,7 +357,7 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, cusparse::spmv_buffersize(handle, trans, alpha, mat, vecb, beta, vecc, alg, &buffer_size); - gko::Array buffer_array(exec, buffer_size); + Array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); cusparse::spmv(handle, trans, alpha, mat, vecb, beta, vecc, alg, buffer); @@ -374,7 +374,7 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, vecb, beta, vecc, alg, &buffer_size); - gko::Array buffer_array(exec, buffer_size); + Array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); cusparse::spmm(handle, trans, trans, alpha, mat, vecb, beta, vecc, alg, buffer); @@ -417,7 +417,7 @@ void spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c) { - if (c->get_size()[0] * c->get_size()[1] == 0) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do } else if (a->get_strategy()->get_name() == "load_balance") { host_kernel::load_balance_spmv(exec, a, b, c); @@ -475,7 +475,7 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - if (c->get_size()[0] * c->get_size()[1] == 0) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do } else if (a->get_strategy()->get_name() == "load_balance") { host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta); @@ -1141,13 +1141,10 @@ void compute_submatrix(std::shared_ptr exec, auto num_cols = result->get_size()[1]; auto row_ptrs = source->get_const_row_ptrs(); auto grid_dim = ceildiv(num_rows, default_block_size); - - auto num_nnz = source->get_num_stored_elements(); - grid_dim = ceildiv(num_nnz, default_block_size); if (grid_dim > 0) { kernel:: compute_submatrix_idxs_and_vals<<>>( - num_rows, num_cols, num_nnz, row_offset, col_offset, + num_rows, num_cols, row_offset, col_offset, as_cuda_type(source->get_const_row_ptrs()), as_cuda_type(source->get_const_col_idxs()), as_cuda_type(source->get_const_values()), diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index c0d6e89ba79..60d2c119a03 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -81,7 +81,7 @@ void simple_apply(std::shared_ptr exec, { if (cublas::is_supported::value) { auto handle = exec->get_cublas_handle(); - if (c->get_size()[0] * c->get_size()[1] > 0) { + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { cublas::pointer_mode_guard pm_guard(handle); auto alpha = one(); @@ -110,7 +110,7 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { if (cublas::is_supported::value) { - if (c->get_size()[0] * c->get_size()[1] > 0) { + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { cublas::gemm( exec->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 14d7f1486bd..c906bb7fd1a 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -151,7 +151,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, {static_cast(b->get_stride())}}); if (alpha == nullptr && beta == nullptr) { - if (grid_size.x * grid_size.y > 0) { + if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<>>( nrows, num_worker_per_row, acc::as_cuda_range(a_vals), @@ -162,7 +162,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( std::array{1}, alpha->get_const_values()); - if (grid_size.x * grid_size.y > 0) { + if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<>>( nrows, num_worker_per_row, acc::as_cuda_range(alpha_val), diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index a197d7833da..d0980029d80 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -265,11 +265,10 @@ void transpose_blocks_impl(syn::value_list, matrix::Fbcsr* const mat) { constexpr int subwarp_size = config::warp_size; - const size_type nbnz = mat->get_num_stored_blocks(); - const size_type numthreads = nbnz * subwarp_size; - const size_type numblocks = ceildiv(numthreads, default_block_size); - const auto block_size = static_cast(default_block_size); - const auto grid_dim = static_cast(numblocks); + const auto nbnz = mat->get_num_stored_blocks(); + const auto numthreads = nbnz * subwarp_size; + const auto block_size = default_block_size; + const auto grid_dim = ceildiv(numthreads, block_size); if (grid_dim > 0) { kernel::transpose_blocks <<>>(nbnz, mat->get_values()); diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index fcf17e9ef1d..aa0ddec3950 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -70,12 +70,11 @@ void spmv(std::shared_ptr exec, const matrix::Sellp* a, const matrix::Dense* b, matrix::Dense* c) { - const auto blockSize = default_block_size; - const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), - b->get_size()[1]); + const auto block_size = default_block_size; + const dim3 grid(ceildiv(a->get_size()[0], block_size), b->get_size()[1]); - if (gridSize.x * gridSize.y > 0) { - spmv_kernel<<>>( + if (grid.x > 0 && grid.y > 0) { + spmv_kernel<<>>( a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), a->get_const_slice_sets(), as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), @@ -94,12 +93,11 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - const auto blockSize = default_block_size; - const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), - b->get_size()[1]); + const auto block_size = default_block_size; + const dim3 grid(ceildiv(a->get_size()[0], block_size), b->get_size()[1]); - if (gridSize.x * gridSize.y > 0) { - advanced_spmv_kernel<<>>( + if (grid.x > 0 && grid.y > 0) { + advanced_spmv_kernel<<>>( a->get_size()[0], b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), a->get_const_slice_sets(), as_cuda_type(alpha->get_const_values()), diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index 205f4208c82..c88eab57738 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -77,6 +77,7 @@ size_type find_natural_blocks(std::shared_ptr exec, { Array nums(exec, 1); + // FIXME: num_rows == 0 bug Array matching_next_row(exec, mtx->get_size()[0] - 1); const auto block_size = config::warp_size; diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp index 0b1936f99ff..4351367d0ba 100644 --- a/cuda/test/matrix/csr_kernels.cpp +++ b/cuda/test/matrix/csr_kernels.cpp @@ -653,21 +653,6 @@ TEST_F(Csr, ConvertsEmptyToSellp) } -TEST_F(Csr, CalculatesNonzerosPerRow) -{ - set_up_apply_data(std::make_shared()); - gko::Array row_nnz(ref, mtx->get_size()[0]); - gko::Array drow_nnz(cuda, dmtx->get_size()[0]); - - gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), - row_nnz.get_data()); - gko::kernels::cuda::csr::count_nonzeros_per_row(cuda, dmtx.get(), - drow_nnz.get_data()); - - GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); -} - - TEST_F(Csr, ConvertToHybridIsEquivalentToRef) { using Hybrid_type = gko::matrix::Hybrid<>; diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 5b35818c0b6..d770bb1673c 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -557,7 +557,6 @@ TEST_F(Ell, ConvertToCsrIsEquivalentToRef) TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); - gko::Array nnz_per_row{ref, mtx->get_size()[0]}; gko::Array dnnz_per_row{cuda, dmtx->get_size()[0]}; diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp index 44c24dffd2a..d028a82850c 100644 --- a/dpcpp/test/matrix/csr_kernels.cpp +++ b/dpcpp/test/matrix/csr_kernels.cpp @@ -654,21 +654,6 @@ TEST_F(Csr, ConvertsEmptyToSellp) } -TEST_F(Csr, CalculatesNonzerosPerRow) -{ - set_up_apply_data(std::make_shared()); - gko::Array row_nnz(ref, mtx->get_size()[0]); - gko::Array drow_nnz(dpcpp, dmtx->get_size()[0]); - - gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), - row_nnz.get_data()); - gko::kernels::dpcpp::csr::count_nonzeros_per_row(dpcpp, dmtx.get(), - drow_nnz.get_data()); - - GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); -} - - TEST_F(Csr, ConvertToHybridIsEquivalentToRef) { using Hybrid_type = gko::matrix::Hybrid; diff --git a/dpcpp/test/matrix/ell_kernels.cpp b/dpcpp/test/matrix/ell_kernels.cpp index c8cd0503561..a3d85a16fd1 100644 --- a/dpcpp/test/matrix/ell_kernels.cpp +++ b/dpcpp/test/matrix/ell_kernels.cpp @@ -581,14 +581,8 @@ TEST_F(Ell, ConvertToCsrIsEquivalentToRef) TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) { set_up_apply_data(); - - gko::Array nnz_per_row; - nnz_per_row.set_executor(ref); - nnz_per_row.resize_and_reset(mtx->get_size()[0]); - - gko::Array dnnz_per_row; - dnnz_per_row.set_executor(dpcpp); - dnnz_per_row.resize_and_reset(dmtx->get_size()[0]); + gko::Array nnz_per_row{ref, mtx->get_size()[0]}; + gko::Array dnnz_per_row{dpcpp, dmtx->get_size()[0]}; gko::kernels::reference::ell::count_nonzeros_per_row( ref, mtx.get(), nnz_per_row.get_data()); diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 7d6a50612c2..de72fe33560 100644 --- a/hip/base/kernel_launch.hip.hpp +++ b/hip/base/kernel_launch.hip.hpp @@ -94,7 +94,7 @@ template void run_kernel(std::shared_ptr exec, KernelFunction fn, dim<2> size, KernelArgs&&... args) { - if (size[0] * size[1] > 0) { + if (size[0] > 0 && size[1] > 0) { constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 44648307d2a..e93f811bfb1 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -371,11 +371,13 @@ void run_generic_kernel_row_reduction(syn::value_list, { const auto num_blocks = ceildiv(rows * col_blocks * subwarp_size, default_block_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), - num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, op, - finalize, as_hip_type(identity), as_hip_type(result), result_stride, - args...); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, + op, finalize, as_hip_type(identity), as_hip_type(result), + result_stride, args...); + } } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, @@ -412,11 +414,14 @@ void run_generic_col_reduction_small(syn::value_list, num_blocks, default_block_size, 0, 0, rows, cols, fn, op, [] __device__(auto v) { return v; }, as_hip_type(identity), as_hip_type(tmp_storage.get_data()), args...); - hipLaunchKernelGGL( - generic_kernel_reduction_finalize_2d, - ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, - num_blocks, op, finalize, as_hip_type(identity), - as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); + if (cols > 0) { + hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), + default_block_size, 0, 0, cols, num_blocks, op, + finalize, as_hip_type(identity), + as_hip_type(tmp_storage.get_const_data()), 1, + as_hip_type(result)); + } } } @@ -448,6 +453,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, static_cast(col_blocks * rows)}; const auto num_blocks = ceildiv(rows * col_blocks * config::warp_size, default_block_size); + // no need to guard this kernel, as rows * cols > resources hipLaunchKernelGGL( HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, @@ -513,6 +519,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, } else { Array tmp_storage{ exec, static_cast(row_blocks * cols)}; + // no need to guard this kernel, as cols > warp_size, row_blocks > 1 hipLaunchKernelGGL( generic_kernel_col_reduction_2d_blocked, dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp index a7fb20515fe..adbe5046818 100644 --- a/hip/base/kernel_launch_solver.hip.hpp +++ b/hip/base/kernel_launch_solver.hip.hpp @@ -65,7 +65,7 @@ void run_kernel_solver(std::shared_ptr exec, KernelFunction fn, dim<2> size, size_type default_stride, KernelArgs&&... args) { - if (size[0] * size[1] > 0) { + if (size[0] > 0 && size[1] > 0) { constexpr auto block_size = kernels::hip::default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks, diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp index d0d36f3e955..db4c4b18717 100644 --- a/hip/factorization/factorization_kernels.hip.cpp +++ b/hip/factorization/factorization_kernels.hip.cpp @@ -77,6 +77,9 @@ void add_diagonal_elements(std::shared_ptr exec, auto num_rows = static_cast(mtx_size[0]); auto num_cols = static_cast(mtx_size[1]); size_type row_ptrs_size = num_rows + 1; + if (num_rows == 0) { + return; + } Array row_ptrs_addition(exec, row_ptrs_size); Array needs_change_host{exec->get_master(), 1}; @@ -159,12 +162,14 @@ void initialize_row_ptrs_l_u( ceildiv(num_rows, static_cast(block_size)); const auto grid_dim = number_blocks; - hipLaunchKernelGGL(kernel::count_nnz_per_l_u_row, grid_dim, block_size, 0, - 0, num_rows, - as_hip_type(system_matrix->get_const_row_ptrs()), - as_hip_type(system_matrix->get_const_col_idxs()), - as_hip_type(system_matrix->get_const_values()), - as_hip_type(l_row_ptrs), as_hip_type(u_row_ptrs)); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::count_nnz_per_l_u_row, grid_dim, block_size, + 0, 0, num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(l_row_ptrs), as_hip_type(u_row_ptrs)); + } components::prefix_sum(exec, l_row_ptrs, num_rows + 1); components::prefix_sum(exec, u_row_ptrs, num_rows + 1); @@ -185,15 +190,19 @@ void initialize_l_u(std::shared_ptr exec, const auto grid_dim = static_cast( ceildiv(num_rows, static_cast(block_size))); - hipLaunchKernelGGL( - kernel::initialize_l_u, grid_dim, block_size, 0, 0, num_rows, - as_hip_type(system_matrix->get_const_row_ptrs()), - as_hip_type(system_matrix->get_const_col_idxs()), - as_hip_type(system_matrix->get_const_values()), - as_hip_type(csr_l->get_const_row_ptrs()), - as_hip_type(csr_l->get_col_idxs()), as_hip_type(csr_l->get_values()), - as_hip_type(csr_u->get_const_row_ptrs()), - as_hip_type(csr_u->get_col_idxs()), as_hip_type(csr_u->get_values())); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::initialize_l_u, grid_dim, block_size, 0, 0, + num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(csr_l->get_const_row_ptrs()), + as_hip_type(csr_l->get_col_idxs()), + as_hip_type(csr_l->get_values()), + as_hip_type(csr_u->get_const_row_ptrs()), + as_hip_type(csr_u->get_col_idxs()), + as_hip_type(csr_u->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -213,12 +222,14 @@ void initialize_row_ptrs_l( ceildiv(num_rows, static_cast(block_size)); const auto grid_dim = number_blocks; - hipLaunchKernelGGL(kernel::count_nnz_per_l_row, grid_dim, block_size, 0, 0, - num_rows, - as_hip_type(system_matrix->get_const_row_ptrs()), - as_hip_type(system_matrix->get_const_col_idxs()), - as_hip_type(system_matrix->get_const_values()), - as_hip_type(l_row_ptrs)); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::count_nnz_per_l_row, grid_dim, block_size, 0, + 0, num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(l_row_ptrs)); + } components::prefix_sum(exec, l_row_ptrs, num_rows + 1); } @@ -237,14 +248,16 @@ void initialize_l(std::shared_ptr exec, const auto grid_dim = static_cast( ceildiv(num_rows, static_cast(block_size))); - hipLaunchKernelGGL(kernel::initialize_l, grid_dim, block_size, 0, 0, - num_rows, - as_hip_type(system_matrix->get_const_row_ptrs()), - as_hip_type(system_matrix->get_const_col_idxs()), - as_hip_type(system_matrix->get_const_values()), - as_hip_type(csr_l->get_const_row_ptrs()), - as_hip_type(csr_l->get_col_idxs()), - as_hip_type(csr_l->get_values()), diag_sqrt); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::initialize_l, grid_dim, block_size, 0, 0, + num_rows, + as_hip_type(system_matrix->get_const_row_ptrs()), + as_hip_type(system_matrix->get_const_col_idxs()), + as_hip_type(system_matrix->get_const_values()), + as_hip_type(csr_l->get_const_row_ptrs()), + as_hip_type(csr_l->get_col_idxs()), + as_hip_type(csr_l->get_values()), diag_sqrt); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index e919f996b96..47cc332b70e 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -73,9 +73,11 @@ void init_factor(std::shared_ptr exec, auto num_blocks = ceildiv(num_rows, default_block_size); auto l_row_ptrs = l->get_const_row_ptrs(); auto l_vals = l->get_values(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::ic_init), num_blocks, - default_block_size, 0, 0, l_row_ptrs, - as_hip_type(l_vals), num_rows); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::ic_init), num_blocks, + default_block_size, 0, 0, l_row_ptrs, + as_hip_type(l_vals), num_rows); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -90,13 +92,17 @@ void compute_factor(std::shared_ptr exec, { auto nnz = l->get_num_stored_elements(); auto num_blocks = ceildiv(nnz, default_block_size); - for (size_type i = 0; i < iterations; ++i) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::ic_sweep), num_blocks, default_block_size, - 0, 0, a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(), - as_hip_type(a_lower->get_const_values()), l->get_const_row_ptrs(), - l->get_const_col_idxs(), as_hip_type(l->get_values()), - static_cast(l->get_num_stored_elements())); + if (num_blocks > 0) { + for (size_type i = 0; i < iterations; ++i) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::ic_sweep), num_blocks, + default_block_size, 0, 0, a_lower->get_const_row_idxs(), + a_lower->get_const_col_idxs(), + as_hip_type(a_lower->get_const_values()), + l->get_const_row_ptrs(), l->get_const_col_idxs(), + as_hip_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } } diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index edd04ab93f6..05d08e598a7 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -106,10 +106,12 @@ void add_candidates(syn::value_list, auto l_vals = l->get_const_values(); auto l_new_row_ptrs = l_new->get_row_ptrs(); // count non-zeros per row - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::ict_tri_spgeam_nnz), num_blocks, - default_block_size, 0, 0, llh_row_ptrs, llh_col_idxs, a_row_ptrs, - a_col_idxs, l_new_row_ptrs, num_rows); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::ict_tri_spgeam_nnz), + num_blocks, default_block_size, 0, 0, llh_row_ptrs, llh_col_idxs, + a_row_ptrs, a_col_idxs, l_new_row_ptrs, num_rows); + } // build row ptrs components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); @@ -123,12 +125,14 @@ void add_candidates(syn::value_list, auto l_new_vals = l_new->get_values(); // fill columns and values - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::ict_tri_spgeam_init), num_blocks, - default_block_size, 0, 0, llh_row_ptrs, llh_col_idxs, - as_hip_type(llh_vals), a_row_ptrs, a_col_idxs, as_hip_type(a_vals), - l_row_ptrs, l_col_idxs, as_hip_type(l_vals), l_new_row_ptrs, - l_new_col_idxs, as_hip_type(l_new_vals), num_rows); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::ict_tri_spgeam_init), + num_blocks, default_block_size, 0, 0, llh_row_ptrs, llh_col_idxs, + as_hip_type(llh_vals), a_row_ptrs, a_col_idxs, as_hip_type(a_vals), + l_row_ptrs, l_col_idxs, as_hip_type(l_vals), l_new_row_ptrs, + l_new_col_idxs, as_hip_type(l_new_vals), num_rows); + } } @@ -145,13 +149,15 @@ void compute_factor(syn::value_list, auto total_nnz = static_cast(l->get_num_stored_elements()); auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::ict_sweep), - num_blocks, default_block_size, 0, 0, - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_hip_type(a->get_const_values()), - l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), - l->get_const_col_idxs(), as_hip_type(l->get_values()), - static_cast(l->get_num_stored_elements())); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::ict_sweep), num_blocks, + default_block_size, 0, 0, a->get_const_row_ptrs(), + a->get_const_col_idxs(), as_hip_type(a->get_const_values()), + l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_hip_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp index c29a554078c..c4b0fadb811 100644 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -74,16 +74,18 @@ void compute_l_u_factors(std::shared_ptr exec, const auto block_size = default_block_size; const auto grid_dim = static_cast( ceildiv(num_elements, static_cast(block_size))); - for (size_type i = 0; i < iterations; ++i) { - hipLaunchKernelGGL( - kernel::compute_l_u_factors, grid_dim, block_size, 0, 0, - num_elements, system_matrix->get_const_row_idxs(), - system_matrix->get_const_col_idxs(), - as_hip_type(system_matrix->get_const_values()), - l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(), - as_hip_type(l_factor->get_values()), u_factor->get_const_row_ptrs(), - u_factor->get_const_col_idxs(), - as_hip_type(u_factor->get_values())); + if (grid_dim > 0) { + for (size_type i = 0; i < iterations; ++i) { + hipLaunchKernelGGL( + kernel::compute_l_u_factors, grid_dim, block_size, 0, 0, + num_elements, system_matrix->get_const_row_idxs(), + system_matrix->get_const_col_idxs(), + as_hip_type(system_matrix->get_const_values()), + l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(), + as_hip_type(l_factor->get_values()), + u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(), + as_hip_type(u_factor->get_values())); + } } } diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index ce6fba1b7f3..d484af93d2a 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -142,9 +142,12 @@ void threshold_filter_approx(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(num_rows, block_size); auto new_row_ptrs = m_out->get_row_ptrs(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter_nnz), - num_blocks, default_block_size, 0, 0, old_row_ptrs, - oracles, num_rows, bucket, new_row_ptrs); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::bucket_filter_nnz), + num_blocks, default_block_size, 0, 0, old_row_ptrs, oracles, + num_rows, bucket, new_row_ptrs); + } // build row pointers components::prefix_sum(exec, new_row_ptrs, num_rows + 1); @@ -167,11 +170,13 @@ void threshold_filter_approx(syn::value_list, Array::view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter), - num_blocks, default_block_size, 0, 0, old_row_ptrs, - old_col_idxs, as_hip_type(old_vals), oracles, num_rows, - bucket, new_row_ptrs, new_row_idxs, new_col_idxs, - as_hip_type(new_vals)); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::bucket_filter), + num_blocks, default_block_size, 0, 0, old_row_ptrs, + old_col_idxs, as_hip_type(old_vals), oracles, + num_rows, bucket, new_row_ptrs, new_row_idxs, + new_col_idxs, as_hip_type(new_vals)); + } } diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp index c7845611700..cf691f58f8b 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -97,10 +97,12 @@ void threshold_filter(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(num_rows, block_size); auto new_row_ptrs = m_out->get_row_ptrs(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::threshold_filter_nnz), num_blocks, - default_block_size, 0, 0, old_row_ptrs, as_hip_type(old_vals), num_rows, - threshold, new_row_ptrs, lower); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::threshold_filter_nnz), + num_blocks, default_block_size, 0, 0, old_row_ptrs, + as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower); + } // build row pointers components::prefix_sum(exec, new_row_ptrs, num_rows + 1); @@ -123,11 +125,13 @@ void threshold_filter(syn::value_list, Array::view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::threshold_filter), - num_blocks, default_block_size, 0, 0, old_row_ptrs, - old_col_idxs, as_hip_type(old_vals), num_rows, threshold, - new_row_ptrs, new_row_idxs, new_col_idxs, - as_hip_type(new_vals), lower); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::threshold_filter), num_blocks, + default_block_size, 0, 0, old_row_ptrs, old_col_idxs, + as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, + new_row_idxs, new_col_idxs, as_hip_type(new_vals), lower); + } } diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index 3e6ae96361a..fd29b5dcd11 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -78,9 +78,11 @@ void sampleselect_count(std::shared_ptr exec, hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), 1, bucket_count, 0, 0, as_hip_type(values), size, tree); // determine bucket sizes - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), num_blocks, - default_block_size, 0, 0, as_hip_type(values), size, - tree, partial_counts, oracles, items_per_thread); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), num_blocks, + default_block_size, 0, 0, as_hip_type(values), size, + tree, partial_counts, oracles, items_per_thread); + } // compute prefix sum and total sum over block-local values hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), bucket_count, default_block_size, 0, 0, partial_counts, total_counts, diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp index 80d8c37ee60..8130a76cb18 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernel.hip.cpp @@ -78,9 +78,12 @@ void sampleselect_filter(const ValueType* values, IndexType size, auto num_threads_total = ceildiv(size, items_per_thread); auto num_blocks = static_cast(ceildiv(num_threads_total, default_block_size)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), num_blocks, - default_block_size, 0, 0, as_hip_type(values), size, - bucket, oracles, partial_counts, out, items_per_thread); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), num_blocks, + default_block_size, 0, 0, as_hip_type(values), size, + bucket, oracles, partial_counts, out, + items_per_thread); + } } diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp index 7dc4f902eea..f8234cc6a04 100644 --- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp @@ -111,11 +111,13 @@ void add_candidates(syn::value_list, auto u_vals = u->get_const_values(); auto l_new_row_ptrs = l_new->get_row_ptrs(); auto u_new_row_ptrs = u_new->get_row_ptrs(); - // count non-zeros per row - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::tri_spgeam_nnz), - num_blocks, default_block_size, 0, 0, lu_row_ptrs, - lu_col_idxs, a_row_ptrs, a_col_idxs, l_new_row_ptrs, - u_new_row_ptrs, num_rows); + if (num_blocks > 0) { + // count non-zeros per row + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::tri_spgeam_nnz), num_blocks, + default_block_size, 0, 0, lu_row_ptrs, lu_col_idxs, a_row_ptrs, + a_col_idxs, l_new_row_ptrs, u_new_row_ptrs, num_rows); + } // build row ptrs components::prefix_sum(exec, l_new_row_ptrs, num_rows + 1); @@ -134,15 +136,17 @@ void add_candidates(syn::value_list, auto u_new_col_idxs = u_new->get_col_idxs(); auto u_new_vals = u_new->get_values(); - // fill columns and values - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::tri_spgeam_init), - num_blocks, default_block_size, 0, 0, lu_row_ptrs, - lu_col_idxs, as_hip_type(lu_vals), a_row_ptrs, - a_col_idxs, as_hip_type(a_vals), l_row_ptrs, l_col_idxs, - as_hip_type(l_vals), u_row_ptrs, u_col_idxs, - as_hip_type(u_vals), l_new_row_ptrs, l_new_col_idxs, - as_hip_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, - as_hip_type(u_new_vals), num_rows); + if (num_blocks > 0) { + // fill columns and values + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::tri_spgeam_init), num_blocks, + default_block_size, 0, 0, lu_row_ptrs, lu_col_idxs, + as_hip_type(lu_vals), a_row_ptrs, a_col_idxs, as_hip_type(a_vals), + l_row_ptrs, l_col_idxs, as_hip_type(l_vals), u_row_ptrs, u_col_idxs, + as_hip_type(u_vals), l_new_row_ptrs, l_new_col_idxs, + as_hip_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, + as_hip_type(u_new_vals), num_rows); + } } diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp index 9463115cda4..e063fd81d0d 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp @@ -96,17 +96,19 @@ void compute_l_u_factors(syn::value_list, u->get_num_stored_elements()); auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::sweep), num_blocks, - default_block_size, 0, 0, a->get_const_row_ptrs(), - a->get_const_col_idxs(), as_hip_type(a->get_const_values()), - l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), - l->get_const_col_idxs(), as_hip_type(l->get_values()), - static_cast(l->get_num_stored_elements()), - u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), - as_hip_type(u->get_values()), u_csc->get_const_row_ptrs(), - u_csc->get_const_col_idxs(), as_hip_type(u_csc->get_values()), - static_cast(u->get_num_stored_elements())); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::sweep), num_blocks, + default_block_size, 0, 0, a->get_const_row_ptrs(), + a->get_const_col_idxs(), as_hip_type(a->get_const_values()), + l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_hip_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + as_hip_type(u->get_values()), u_csc->get_const_row_ptrs(), + u_csc->get_const_col_idxs(), as_hip_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); + } } diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp index 36b053e25f6..ea5922b4ff4 100644 --- a/hip/matrix/coo_kernels.hip.cpp +++ b/hip/matrix/coo_kernels.hip.cpp @@ -117,7 +117,7 @@ void spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); - if (nwarps > 0) { + if (nwarps > 0 && b_ncols > 0) { // TODO: b_ncols needs to be tuned. if (b_ncols < 4) { const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); @@ -158,7 +158,7 @@ void advanced_spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto b_ncols = b->get_size()[1]; - if (nwarps > 0) { + if (nwarps > 0 && b_ncols > 0) { // TODO: b_ncols needs to be tuned. if (b_ncols < 4) { int num_lines = ceildiv(nnz, nwarps * config::warp_size); diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index cf1f7eea2a7..eafbd367c0e 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -277,7 +277,9 @@ void spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c) { - if (a->get_strategy()->get_name() == "load_balance") { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + } else if (a->get_strategy()->get_name() == "load_balance") { components::fill_array(exec, c->get_values(), c->get_num_stored_elements(), zero()); const IndexType nwarps = a->get_num_srow_elements(); @@ -370,7 +372,9 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - if (a->get_strategy()->get_name() == "load_balance") { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + } else if (a->get_strategy()->get_name() == "load_balance") { dense::scale(exec, beta, c); const IndexType nwarps = a->get_num_srow_elements(); @@ -550,9 +554,11 @@ void spgeam(syn::value_list, // count nnz for alpha * A + beta * B auto subwarps_per_block = default_block_size / subwarp_size; auto num_blocks = ceildiv(m, subwarps_per_block); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam_nnz), - num_blocks, default_block_size, 0, 0, a_row_ptrs, - a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam_nnz), + num_blocks, default_block_size, 0, 0, a_row_ptrs, + a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + } // build row pointers components::prefix_sum(exec, c_row_ptrs, m + 1); @@ -564,11 +570,14 @@ void spgeam(syn::value_list, c_builder.get_value_array().resize_and_reset(c_nnz); auto c_col_idxs = c->get_col_idxs(); auto c_vals = c->get_values(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::spgeam), num_blocks, - default_block_size, 0, 0, as_hip_type(alpha), a_row_ptrs, a_col_idxs, - as_hip_type(a_vals), as_hip_type(beta), b_row_ptrs, b_col_idxs, - as_hip_type(b_vals), m, c_row_ptrs, c_col_idxs, as_hip_type(c_vals)); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::spgeam), + num_blocks, default_block_size, 0, 0, + as_hip_type(alpha), a_row_ptrs, a_col_idxs, + as_hip_type(a_vals), as_hip_type(beta), b_row_ptrs, + b_col_idxs, as_hip_type(b_vals), m, c_row_ptrs, + c_col_idxs, as_hip_type(c_vals)); + } } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); @@ -711,10 +720,12 @@ void fill_in_dense(std::shared_ptr exec, const auto vals = source->get_const_values(); auto grid_dim = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL(kernel::fill_in_dense, grid_dim, default_block_size, 0, - 0, num_rows, as_hip_type(row_ptrs), - as_hip_type(col_idxs), as_hip_type(vals), stride, - as_hip_type(result->get_values())); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::fill_in_dense, grid_dim, default_block_size, + 0, 0, num_rows, as_hip_type(row_ptrs), + as_hip_type(col_idxs), as_hip_type(vals), stride, + as_hip_type(result->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -774,9 +785,11 @@ void conj_transpose(std::shared_ptr exec, orig->get_const_col_idxs(), trans->get_values(), trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); - hipLaunchKernelGGL(conjugate_kernel, grid_size, block_size, 0, 0, - trans->get_num_stored_elements(), - as_hip_type(trans->get_values())); + if (grid_size > 0) { + hipLaunchKernelGGL(conjugate_kernel, grid_size, block_size, 0, 0, + trans->get_num_stored_elements(), + as_hip_type(trans->get_values())); + } } else { GKO_NOT_IMPLEMENTED; } @@ -794,19 +807,23 @@ void inv_symm_permute(std::shared_ptr exec, { auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), - count_num_blocks, default_block_size, 0, 0, num_rows, - perm, orig->get_const_row_ptrs(), - permuted->get_row_ptrs()); + if (count_num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + count_num_blocks, default_block_size, 0, 0, num_rows, + perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } components::prefix_sum(exec, permuted->get_row_ptrs(), num_rows + 1); auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_symm_permute_kernel), - copy_num_blocks, default_block_size, 0, 0, num_rows, perm, - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), - as_hip_type(orig->get_const_values()), permuted->get_row_ptrs(), - permuted->get_col_idxs(), as_hip_type(permuted->get_values())); + if (copy_num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(inv_symm_permute_kernel), + copy_num_blocks, default_block_size, 0, 0, num_rows, perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_hip_type(orig->get_const_values()), permuted->get_row_ptrs(), + permuted->get_col_idxs(), as_hip_type(permuted->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -820,19 +837,24 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, { auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL(HIP_KERNEL_NAME(row_ptr_permute_kernel), - count_num_blocks, default_block_size, 0, 0, num_rows, - perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); + if (count_num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(row_ptr_permute_kernel), + count_num_blocks, default_block_size, 0, 0, num_rows, + perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1); auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(row_permute_kernel), copy_num_blocks, - default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), - row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), - as_hip_type(row_permuted->get_values())); + if (copy_num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(row_permute_kernel), + copy_num_blocks, default_block_size, 0, 0, num_rows, perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), + row_permuted->get_col_idxs(), + as_hip_type(row_permuted->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -847,19 +869,24 @@ void inverse_row_permute(std::shared_ptr exec, { auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), - count_num_blocks, default_block_size, 0, 0, num_rows, - perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); + if (count_num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + count_num_blocks, default_block_size, 0, 0, num_rows, + perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1); auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_row_permute_kernel), - copy_num_blocks, default_block_size, 0, 0, num_rows, perm, - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), - as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), - row_permuted->get_col_idxs(), as_hip_type(row_permuted->get_values())); + if (copy_num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(inv_row_permute_kernel), + copy_num_blocks, default_block_size, 0, 0, num_rows, perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), + row_permuted->get_col_idxs(), + as_hip_type(row_permuted->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -877,10 +904,12 @@ void calculate_nonzeros_per_row_in_span( auto col_idxs = source->get_const_col_idxs(); auto grid_dim = ceildiv(row_span.length(), default_block_size); - hipLaunchKernelGGL(kernel::calculate_nnz_per_row_in_span, grid_dim, - default_block_size, 0, 0, row_span, col_span, - as_hip_type(row_ptrs), as_hip_type(col_idxs), - as_hip_type(row_nnz->get_data())); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::calculate_nnz_per_row_in_span, grid_dim, + default_block_size, 0, 0, row_span, col_span, + as_hip_type(row_ptrs), as_hip_type(col_idxs), + as_hip_type(row_nnz->get_data())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -899,17 +928,17 @@ void compute_submatrix(std::shared_ptr exec, auto num_cols = result->get_size()[1]; auto row_ptrs = source->get_const_row_ptrs(); auto grid_dim = ceildiv(num_rows, default_block_size); - - auto num_nnz = source->get_num_stored_elements(); - grid_dim = ceildiv(num_nnz, default_block_size); - hipLaunchKernelGGL( - kernel::compute_submatrix_idxs_and_vals, grid_dim, default_block_size, - 0, 0, num_rows, num_cols, num_nnz, row_offset, col_offset, - as_hip_type(source->get_const_row_ptrs()), - as_hip_type(source->get_const_col_idxs()), - as_hip_type(source->get_const_values()), - as_hip_type(result->get_const_row_ptrs()), - as_hip_type(result->get_col_idxs()), as_hip_type(result->get_values())); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::compute_submatrix_idxs_and_vals, grid_dim, + default_block_size, 0, 0, num_rows, num_cols, + row_offset, col_offset, + as_hip_type(source->get_const_row_ptrs()), + as_hip_type(source->get_const_col_idxs()), + as_hip_type(source->get_const_values()), + as_hip_type(result->get_const_row_ptrs()), + as_hip_type(result->get_col_idxs()), + as_hip_type(result->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -975,10 +1004,12 @@ void is_sorted_by_column_index( auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); auto num_blocks = ceildiv(num_rows, block_size); - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::check_unsorted), num_blocks, - block_size, 0, 0, to_check->get_const_row_ptrs(), - to_check->get_const_col_idxs(), num_rows, - gpu_array.get_data()); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::check_unsorted), num_blocks, + block_size, 0, 0, to_check->get_const_row_ptrs(), + to_check->get_const_col_idxs(), num_rows, + gpu_array.get_data()); + } cpu_array = gpu_array; } @@ -1000,11 +1031,13 @@ void extract_diagonal(std::shared_ptr exec, const auto orig_row_ptrs = orig->get_const_row_ptrs(); const auto orig_col_idxs = orig->get_const_col_idxs(); auto diag_values = diag->get_values(); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::extract_diagonal), num_blocks, - default_block_size, 0, 0, diag_size, nnz, - as_hip_type(orig_values), as_hip_type(orig_row_ptrs), - as_hip_type(orig_col_idxs), as_hip_type(diag_values)); + if (num_blocks > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::extract_diagonal), + num_blocks, default_block_size, 0, 0, diag_size, nnz, + as_hip_type(orig_values), as_hip_type(orig_row_ptrs), + as_hip_type(orig_col_idxs), + as_hip_type(diag_values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 7386b38f692..188deeb7a0f 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -84,7 +84,7 @@ void simple_apply(std::shared_ptr exec, { if (hipblas::is_supported::value) { auto handle = exec->get_hipblas_handle(); - if (c->get_size()[0] * c->get_size()[1] > 0) { + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { hipblas::pointer_mode_guard pm_guard(handle); auto alpha = one(); @@ -114,7 +114,7 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { if (hipblas::is_supported::value) { - if (c->get_size()[0] * c->get_size()[1] > 0) { + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { hipblas::gemm( exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N, diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp index e6e26a7be69..ca4f1e7754c 100644 --- a/hip/matrix/diagonal_kernels.hip.cpp +++ b/hip/matrix/diagonal_kernels.hip.cpp @@ -77,9 +77,11 @@ void apply_to_csr(std::shared_ptr exec, const auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size); - hipLaunchKernelGGL(kernel::apply_to_csr, grid_dim, default_block_size, 0, 0, - num_rows, as_hip_type(diag_values), - as_hip_type(csr_row_ptrs), as_hip_type(csr_values)); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::apply_to_csr, grid_dim, default_block_size, + 0, 0, num_rows, as_hip_type(diag_values), + as_hip_type(csr_row_ptrs), as_hip_type(csr_values)); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 6f7e882f429..2486a666ab0 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -154,22 +154,27 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, {static_cast(b->get_stride())}}); if (alpha == nullptr && beta == nullptr) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::spmv), - grid_size, block_size, 0, 0, nrows, num_worker_per_row, - acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_hip_range(b_vals), - as_hip_type(c->get_values()), c->get_stride()); + if (grid_size.x > 0 && grid_size.y > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::spmv), + grid_size, block_size, 0, 0, nrows, num_worker_per_row, + acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride, + num_stored_elements_per_row, acc::as_hip_range(b_vals), + as_hip_type(c->get_values()), c->get_stride()); + } } else if (alpha != nullptr && beta != nullptr) { - const auto alpha_val = acc::range( - std::array{1}, alpha->get_const_values()); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::spmv), - grid_size, block_size, 0, 0, nrows, num_worker_per_row, - acc::as_hip_range(alpha_val), acc::as_hip_range(a_vals), - a->get_const_col_idxs(), stride, num_stored_elements_per_row, - acc::as_hip_range(b_vals), as_hip_type(beta->get_const_values()), - as_hip_type(c->get_values()), c->get_stride()); + if (grid_size.x > 0 && grid_size.y > 0) { + const auto alpha_val = acc::range( + std::array{1}, alpha->get_const_values()); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::spmv), + grid_size, block_size, 0, 0, nrows, num_worker_per_row, + acc::as_hip_range(alpha_val), acc::as_hip_range(a_vals), + a->get_const_col_idxs(), stride, num_stored_elements_per_row, + acc::as_hip_range(b_vals), + as_hip_type(beta->get_const_values()), + as_hip_type(c->get_values()), c->get_stride()); + } } else { GKO_KERNEL_NOT_FOUND; } diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 2739a173fcc..c693405da56 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -115,12 +115,15 @@ void convert_to_csr(const std::shared_ptr exec, constexpr auto warps_per_block = default_block_size / config::warp_size; const auto num_blocks = ceildiv(source->get_num_block_rows(), warps_per_block); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::convert_to_csr), num_blocks, default_block_size, - 0, 0, source->get_const_row_ptrs(), source->get_const_col_idxs(), - as_hip_type(source->get_const_values()), result->get_row_ptrs(), - result->get_col_idxs(), as_hip_type(result->get_values()), - source->get_num_block_rows(), source->get_block_size()); + if (num_blocks > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::convert_to_csr), num_blocks, + default_block_size, 0, 0, source->get_const_row_ptrs(), + source->get_const_col_idxs(), + as_hip_type(source->get_const_values()), result->get_row_ptrs(), + result->get_col_idxs(), as_hip_type(result->get_values()), + source->get_num_block_rows(), source->get_block_size()); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index 507a6f9ef2a..e3a618e0925 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -73,16 +73,18 @@ void spmv(std::shared_ptr exec, const matrix::Sellp* a, const matrix::Dense* b, matrix::Dense* c) { - const auto blockSize = default_block_size; - const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), - b->get_size()[1]); - - hipLaunchKernelGGL( - spmv_kernel, gridSize, blockSize, 0, 0, a->get_size()[0], - b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), - a->get_const_slice_sets(), as_hip_type(a->get_const_values()), - a->get_const_col_idxs(), as_hip_type(b->get_const_values()), - as_hip_type(c->get_values())); + const auto block_size = default_block_size; + const dim3 grid(ceildiv(a->get_size()[0], default_block_size), + b->get_size()[1]); + + if (grid.x > 0 && grid.y > 0) { + hipLaunchKernelGGL( + spmv_kernel, grid, block_size, 0, 0, a->get_size()[0], + b->get_size()[1], b->get_stride(), c->get_stride(), + a->get_slice_size(), a->get_const_slice_sets(), + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(b->get_const_values()), as_hip_type(c->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SELLP_SPMV_KERNEL); @@ -96,17 +98,21 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { - const auto blockSize = default_block_size; - const dim3 gridSize(ceildiv(a->get_size()[0], default_block_size), - b->get_size()[1]); - - hipLaunchKernelGGL( - advanced_spmv_kernel, gridSize, blockSize, 0, 0, a->get_size()[0], - b->get_size()[1], b->get_stride(), c->get_stride(), a->get_slice_size(), - a->get_const_slice_sets(), as_hip_type(alpha->get_const_values()), - as_hip_type(a->get_const_values()), a->get_const_col_idxs(), - as_hip_type(b->get_const_values()), - as_hip_type(beta->get_const_values()), as_hip_type(c->get_values())); + const auto block_size = default_block_size; + const dim3 grid(ceildiv(a->get_size()[0], default_block_size), + b->get_size()[1]); + + if (grid.x > 0 && grid.y > 0) { + hipLaunchKernelGGL( + advanced_spmv_kernel, grid, block_size, 0, 0, a->get_size()[0], + b->get_size()[1], b->get_stride(), c->get_stride(), + a->get_slice_size(), a->get_const_slice_sets(), + as_hip_type(alpha->get_const_values()), + as_hip_type(a->get_const_values()), a->get_const_col_idxs(), + as_hip_type(b->get_const_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(c->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp index a0a16da9659..1c0444a3f50 100644 --- a/hip/multigrid/amgx_pgm_kernels.hip.cpp +++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp @@ -82,9 +82,11 @@ void match_edge(std::shared_ptr exec, { const auto num = agg.get_num_elems(); const auto grid = ceildiv(num, default_block_size); - hipLaunchKernelGGL(kernel::match_edge_kernel, grid, default_block_size, 0, - 0, num, strongest_neighbor.get_const_data(), - agg.get_data()); + if (grid > 0) { + hipLaunchKernelGGL(kernel::match_edge_kernel, grid, default_block_size, + 0, 0, num, strongest_neighbor.get_const_data(), + agg.get_data()); + } } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL); @@ -96,9 +98,11 @@ void count_unagg(std::shared_ptr exec, { Array active_agg(exec, agg.get_num_elems()); const auto grid = ceildiv(active_agg.get_num_elems(), default_block_size); - hipLaunchKernelGGL(kernel::activate_kernel, grid, default_block_size, 0, 0, - active_agg.get_num_elems(), agg.get_const_data(), - active_agg.get_data()); + if (grid > 0) { + hipLaunchKernelGGL(kernel::activate_kernel, grid, default_block_size, 0, + 0, active_agg.get_num_elems(), agg.get_const_data(), + active_agg.get_data()); + } *num_unagg = reduce_add_array(exec, active_agg.get_num_elems(), active_agg.get_const_data()); } @@ -113,11 +117,15 @@ void renumber(std::shared_ptr exec, Array& agg, const auto num = agg.get_num_elems(); Array agg_map(exec, num + 1); const auto grid = ceildiv(num, default_block_size); - hipLaunchKernelGGL(kernel::fill_agg_kernel, grid, default_block_size, 0, 0, - num, agg.get_const_data(), agg_map.get_data()); + if (grid > 0) { + hipLaunchKernelGGL(kernel::fill_agg_kernel, grid, default_block_size, 0, + 0, num, agg.get_const_data(), agg_map.get_data()); + } components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems()); - hipLaunchKernelGGL(kernel::renumber_kernel, grid, default_block_size, 0, 0, - num, agg_map.get_const_data(), agg.get_data()); + if (grid > 0) { + hipLaunchKernelGGL(kernel::renumber_kernel, grid, default_block_size, 0, + 0, num, agg_map.get_const_data(), agg.get_data()); + } *num_agg = exec->copy_val_to_host(agg_map.get_const_data() + num); } @@ -133,11 +141,14 @@ void find_strongest_neighbor( { const auto num = agg.get_num_elems(); const auto grid = ceildiv(num, default_block_size); - hipLaunchKernelGGL( - kernel::find_strongest_neighbor_kernel, grid, default_block_size, 0, 0, - num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(), - weight_mtx->get_const_values(), diag->get_const_values(), - agg.get_data(), strongest_neighbor.get_data()); + if (grid > 0) { + hipLaunchKernelGGL( + kernel::find_strongest_neighbor_kernel, grid, default_block_size, 0, + 0, num, weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), + diag->get_const_values(), agg.get_data(), + strongest_neighbor.get_data()); + } } GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE( @@ -154,23 +165,26 @@ void assign_to_exist_agg(std::shared_ptr exec, const auto num = agg.get_num_elems(); const auto grid = ceildiv(num, default_block_size); - if (intermediate_agg.get_num_elems() > 0) { - // determinstic kernel - hipLaunchKernelGGL( - kernel::assign_to_exist_agg_kernel, grid, default_block_size, 0, 0, - num, weight_mtx->get_const_row_ptrs(), - weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), - diag->get_const_values(), agg.get_const_data(), - intermediate_agg.get_data()); - // Copy the intermediate_agg to agg - agg = intermediate_agg; - } else { - // undeterminstic kernel - hipLaunchKernelGGL( - kernel::assign_to_exist_agg_kernel, grid, default_block_size, 0, 0, - num, weight_mtx->get_const_row_ptrs(), - weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(), - diag->get_const_values(), agg.get_data()); + if (grid > 0) { + if (intermediate_agg.get_num_elems() > 0) { + // determinstic kernel + hipLaunchKernelGGL( + kernel::assign_to_exist_agg_kernel, grid, default_block_size, 0, + 0, num, weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), + weight_mtx->get_const_values(), diag->get_const_values(), + agg.get_const_data(), intermediate_agg.get_data()); + // Copy the intermediate_agg to agg + agg = intermediate_agg; + } else { + // undeterminstic kernel + hipLaunchKernelGGL(kernel::assign_to_exist_agg_kernel, grid, + default_block_size, 0, 0, num, + weight_mtx->get_const_row_ptrs(), + weight_mtx->get_const_col_idxs(), + weight_mtx->get_const_values(), + diag->get_const_values(), agg.get_data()); + } } } diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp index e43e47f83ce..77090304654 100644 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -85,24 +85,26 @@ void generate_tri_inverse(std::shared_ptr exec, const auto block = default_block_size; const auto grid = ceildiv(num_rows, block / subwarp_size); - if (lower) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::generate_l_inverse), - grid, block, 0, 0, static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), - inverse->get_col_idxs(), as_hip_type(inverse->get_values()), - excess_rhs_ptrs, excess_nz_ptrs); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::generate_u_inverse), - grid, block, 0, 0, static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), - inverse->get_col_idxs(), as_hip_type(inverse->get_values()), - excess_rhs_ptrs, excess_nz_ptrs); + if (grid > 0) { + if (lower) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::generate_l_inverse), + grid, block, 0, 0, static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), + inverse->get_col_idxs(), as_hip_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::generate_u_inverse), + grid, block, 0, 0, static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), + inverse->get_col_idxs(), as_hip_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs); + } } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); @@ -123,14 +125,17 @@ void generate_general_inverse(std::shared_ptr exec, const auto block = default_block_size; const auto grid = ceildiv(num_rows, block / subwarp_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::generate_general_inverse), - grid, block, 0, 0, static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), - inverse->get_col_idxs(), as_hip_type(inverse->get_values()), - excess_rhs_ptrs, excess_nz_ptrs, spd); + if (grid > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::generate_general_inverse), + grid, block, 0, 0, static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_hip_type(input->get_const_values()), inverse->get_row_ptrs(), + inverse->get_col_idxs(), as_hip_type(inverse->get_values()), + excess_rhs_ptrs, excess_nz_ptrs, spd); + } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); } @@ -153,15 +158,18 @@ void generate_excess_system(std::shared_ptr exec, const auto block = default_block_size; const auto grid = ceildiv(e_end - e_start, block / subwarp_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::generate_excess_system), grid, - block, 0, 0, static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - as_hip_type(input->get_const_values()), inverse->get_const_row_ptrs(), - inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, - excess_system->get_row_ptrs(), excess_system->get_col_idxs(), - as_hip_type(excess_system->get_values()), - as_hip_type(excess_rhs->get_values()), e_start, e_end); + if (grid > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::generate_excess_system), grid, + block, 0, 0, static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + as_hip_type(input->get_const_values()), + inverse->get_const_row_ptrs(), inverse->get_const_col_idxs(), + excess_rhs_ptrs, excess_nz_ptrs, excess_system->get_row_ptrs(), + excess_system->get_col_idxs(), + as_hip_type(excess_system->get_values()), + as_hip_type(excess_rhs->get_values()), e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -176,10 +184,12 @@ void scale_excess_solution(std::shared_ptr, { const auto block = default_block_size; const auto grid = ceildiv(e_end - e_start, block / subwarp_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::scale_excess_solution), grid, - block, 0, 0, excess_block_ptrs, - as_hip_type(excess_solution->get_values()), e_start, e_end); + if (grid > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::scale_excess_solution), grid, + block, 0, 0, excess_block_ptrs, + as_hip_type(excess_solution->get_values()), e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -197,12 +207,14 @@ void scatter_excess_solution(std::shared_ptr exec, const auto block = default_block_size; const auto grid = ceildiv(e_end - e_start, block / subwarp_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::copy_excess_solution), grid, - block, 0, 0, static_cast(num_rows), - inverse->get_const_row_ptrs(), excess_rhs_ptrs, - as_hip_type(excess_solution->get_const_values()), - as_hip_type(inverse->get_values()), e_start, e_end); + if (grid > 0) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::copy_excess_solution), grid, + block, 0, 0, static_cast(num_rows), + inverse->get_const_row_ptrs(), excess_rhs_ptrs, + as_hip_type(excess_solution->get_const_values()), + as_hip_type(inverse->get_values()), e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp index f347ba00363..87042e35332 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -92,21 +92,24 @@ void advanced_apply( ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::advanced_adaptive_apply), - grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, - block_precisions, block_pointers, num_blocks, as_hip_type(alpha), - as_hip_type(b), b_stride, as_hip_type(x), x_stride); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::advanced_apply), - grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, - block_pointers, num_blocks, as_hip_type(alpha), as_hip_type(b), - b_stride, as_hip_type(x), x_stride); + if (grid_size > 0) { + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::advanced_adaptive_apply< + max_block_size, subwarp_size, warps_per_block>), + grid_size, block_size, 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(alpha), as_hip_type(b), b_stride, as_hip_type(x), + x_stride); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::advanced_apply), + grid_size, block_size, 0, 0, as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, as_hip_type(alpha), + as_hip_type(b), b_stride, as_hip_type(x), x_stride); + } } } diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 37a6c027438..44ca906769f 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -92,24 +92,27 @@ void generate(syn::value_list, ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::adaptive_generate), - grid_size, block_size, 0, 0, mtx->get_size()[0], - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - as_hip_type(mtx->get_const_values()), as_hip_type(accuracy), - as_hip_type(block_data), storage_scheme, as_hip_type(conditioning), - block_precisions, block_ptrs, num_blocks); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::generate), - grid_size, block_size, 0, 0, mtx->get_size()[0], - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - as_hip_type(mtx->get_const_values()), as_hip_type(block_data), - storage_scheme, block_ptrs, num_blocks); + if (grid_size > 0) { + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::adaptive_generate), + grid_size, block_size, 0, 0, mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_hip_type(mtx->get_const_values()), as_hip_type(accuracy), + as_hip_type(block_data), storage_scheme, + as_hip_type(conditioning), block_precisions, block_ptrs, + num_blocks); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::generate), + grid_size, block_size, 0, 0, mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_hip_type(mtx->get_const_values()), as_hip_type(block_data), + storage_scheme, block_ptrs, num_blocks); + } } } diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index ed374915cc7..d8c365fa12d 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -84,15 +84,18 @@ size_type find_natural_blocks(std::shared_ptr exec, { Array nums(exec, 1); + // FIXME: num_rows == 0 bug Array matching_next_row(exec, mtx->get_size()[0] - 1); const auto block_size = config::warp_size; const auto grid_size = ceildiv(mtx->get_size()[0] * config::warp_size, block_size); - hipLaunchKernelGGL(compare_adjacent_rows, grid_size, block_size, 0, 0, - mtx->get_size()[0], max_block_size, - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - matching_next_row.get_data()); + if (grid_size > 0) { + hipLaunchKernelGGL(compare_adjacent_rows, grid_size, block_size, 0, 0, + mtx->get_size()[0], max_block_size, + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + matching_next_row.get_data()); + } hipLaunchKernelGGL(generate_natural_block_pointer, 1, 1, 0, 0, mtx->get_size()[0], max_block_size, matching_next_row.get_const_data(), block_ptrs, @@ -129,10 +132,12 @@ void initialize_precisions(std::shared_ptr exec, const auto grid_size = min( default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); - hipLaunchKernelGGL(HIP_KERNEL_NAME(duplicate_array), - grid_size, block_size, 0, 0, source.get_const_data(), - source.get_num_elems(), precisions.get_data(), - precisions.get_num_elems()); + if (grid_size > 0) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(duplicate_array), + grid_size, block_size, 0, 0, source.get_const_data(), + source.get_num_elems(), precisions.get_data(), + precisions.get_num_elems()); + } } @@ -171,20 +176,24 @@ void transpose_jacobi( ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - adaptive_transpose_jacobi), - grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, - block_precisions, block_pointers, num_blocks, - as_hip_type(out_blocks)); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(transpose_jacobi), - grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, - block_pointers, num_blocks, as_hip_type(out_blocks)); + if (grid_size > 0) { + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + adaptive_transpose_jacobi), + grid_size, block_size, 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(out_blocks)); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + transpose_jacobi), + grid_size, block_size, 0, 0, as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, + as_hip_type(out_blocks)); + } } } diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp index 1fa2f262fbf..c4a63007798 100644 --- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -88,20 +88,23 @@ void apply(syn::value_list, size_type num_blocks, ceildiv(num_blocks, warps_per_block * blocks_per_warp); const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - if (block_precisions) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::adaptive_apply), - grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, - block_precisions, block_pointers, num_blocks, as_hip_type(b), - b_stride, as_hip_type(x), x_stride); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::apply), - grid_size, block_size, 0, 0, as_hip_type(blocks), storage_scheme, - block_pointers, num_blocks, as_hip_type(b), b_stride, - as_hip_type(x), x_stride); + if (grid_size > 0) { + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::adaptive_apply), + grid_size, block_size, 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(b), b_stride, as_hip_type(x), x_stride); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::apply), + grid_size, block_size, 0, 0, as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, as_hip_type(b), + b_stride, as_hip_type(x), x_stride); + } } } diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp index f70c4d7e38f..c70a2793af0 100644 --- a/hip/solver/multigrid_kernels.hip.cpp +++ b/hip/solver/multigrid_kernels.hip.cpp @@ -84,13 +84,15 @@ void kcycle_step_1(std::shared_ptr exec, const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; const auto grid = ceildiv(grid_nrows * nrhs, default_block_size); - hipLaunchKernelGGL( - kernel::kcycle_step_1_kernel, grid, default_block_size, 0, 0, nrows, - nrhs, e->get_stride(), grid_nrows, - as_hip_type(alpha->get_const_values()), - as_hip_type(rho->get_const_values()), - as_hip_type(v->get_const_values()), as_hip_type(g->get_values()), - as_hip_type(d->get_values()), as_hip_type(e->get_values())); + if (grid > 0) { + hipLaunchKernelGGL( + kernel::kcycle_step_1_kernel, grid, default_block_size, 0, 0, nrows, + nrhs, e->get_stride(), grid_nrows, + as_hip_type(alpha->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(v->get_const_values()), as_hip_type(g->get_values()), + as_hip_type(d->get_values()), as_hip_type(e->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_1_KERNEL); @@ -112,15 +114,17 @@ void kcycle_step_2(std::shared_ptr exec, const size_type grid_nrows = max_size / nrhs < nrows ? max_size / nrhs : nrows; const auto grid = ceildiv(grid_nrows * nrhs, default_block_size); - hipLaunchKernelGGL(kernel::kcycle_step_2_kernel, grid, default_block_size, - 0, 0, nrows, nrhs, e->get_stride(), grid_nrows, - as_hip_type(alpha->get_const_values()), - as_hip_type(rho->get_const_values()), - as_hip_type(gamma->get_const_values()), - as_hip_type(beta->get_const_values()), - as_hip_type(zeta->get_const_values()), - as_hip_type(d->get_const_values()), - as_hip_type(e->get_values())); + if (grid > 0) { + hipLaunchKernelGGL( + kernel::kcycle_step_2_kernel, grid, default_block_size, 0, 0, nrows, + nrhs, e->get_stride(), grid_nrows, + as_hip_type(alpha->get_const_values()), + as_hip_type(rho->get_const_values()), + as_hip_type(gamma->get_const_values()), + as_hip_type(beta->get_const_values()), + as_hip_type(zeta->get_const_values()), + as_hip_type(d->get_const_values()), as_hip_type(e->get_values())); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_STEP_2_KERNEL); @@ -137,11 +141,13 @@ void kcycle_check_stop(std::shared_ptr exec, true); const auto nrhs = new_norm->get_size()[1]; const auto grid = ceildiv(nrhs, default_block_size); - hipLaunchKernelGGL(kernel::kcycle_check_stop_kernel, grid, - default_block_size, 0, 0, nrhs, - as_hip_type(old_norm->get_const_values()), - as_hip_type(new_norm->get_const_values()), rel_tol, - as_hip_type(dis_stop.get_data())); + if (grid > 0) { + hipLaunchKernelGGL(kernel::kcycle_check_stop_kernel, grid, + default_block_size, 0, 0, nrhs, + as_hip_type(old_norm->get_const_values()), + as_hip_type(new_norm->get_const_values()), rel_tol, + as_hip_type(dis_stop.get_data())); + } is_stop = exec->copy_val_to_host(dis_stop.get_const_data()); } diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp index af2717a658d..1856a6c9412 100644 --- a/hip/stop/criterion_kernels.hip.cpp +++ b/hip/stop/criterion_kernels.hip.cpp @@ -74,9 +74,11 @@ void set_all_statuses(std::shared_ptr exec, uint8 stoppingId, const auto block_size = default_block_size; const auto grid_size = ceildiv(stop_status->get_num_elems(), block_size); - hipLaunchKernelGGL((set_all_statuses), grid_size, block_size, 0, 0, - stop_status->get_num_elems(), stoppingId, setFinalized, - as_hip_type(stop_status->get_data())); + if (grid_size > 0) { + hipLaunchKernelGGL((set_all_statuses), grid_size, block_size, 0, 0, + stop_status->get_num_elems(), stoppingId, + setFinalized, as_hip_type(stop_status->get_data())); + } } diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index 5925d7c6d9a..68a3041cd72 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -108,12 +108,15 @@ void residual_norm(std::shared_ptr exec, const auto block_size = default_block_size; const auto grid_size = ceildiv(tau->get_size()[1], block_size); - hipLaunchKernelGGL((residual_norm_kernel), grid_size, block_size, 0, 0, - tau->get_size()[1], rel_residual_goal, - as_hip_type(tau->get_const_values()), - as_hip_type(orig_tau->get_const_values()), stoppingId, - setFinalized, as_hip_type(stop_status->get_data()), - as_hip_type(device_storage->get_data())); + if (grid_size > 0) { + hipLaunchKernelGGL((residual_norm_kernel), grid_size, block_size, 0, 0, + tau->get_size()[1], rel_residual_goal, + as_hip_type(tau->get_const_values()), + as_hip_type(orig_tau->get_const_values()), + stoppingId, setFinalized, + as_hip_type(stop_status->get_data()), + as_hip_type(device_storage->get_data())); + } /* Represents all_converged, one_changed */ *all_converged = exec->copy_val_to_host(device_storage->get_const_data()); @@ -186,12 +189,15 @@ void implicit_residual_norm( const auto block_size = default_block_size; const auto grid_size = ceildiv(tau->get_size()[1], block_size); - hipLaunchKernelGGL((implicit_residual_norm_kernel), grid_size, block_size, - 0, 0, tau->get_size()[1], rel_residual_goal, - as_hip_type(tau->get_const_values()), - as_hip_type(orig_tau->get_const_values()), stoppingId, - setFinalized, as_hip_type(stop_status->get_data()), - as_hip_type(device_storage->get_data())); + if (grid_size > 0) { + hipLaunchKernelGGL( + (implicit_residual_norm_kernel), grid_size, block_size, 0, 0, + tau->get_size()[1], rel_residual_goal, + as_hip_type(tau->get_const_values()), + as_hip_type(orig_tau->get_const_values()), stoppingId, setFinalized, + as_hip_type(stop_status->get_data()), + as_hip_type(device_storage->get_data())); + } /* Represents all_converged, one_changed */ *all_converged = exec->copy_val_to_host(device_storage->get_const_data()); diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp index 3e03cc5cd1e..bdf89007b44 100644 --- a/hip/test/matrix/csr_kernels.hip.cpp +++ b/hip/test/matrix/csr_kernels.hip.cpp @@ -663,21 +663,6 @@ TEST_F(Csr, ConvertsEmptyToSellp) } -TEST_F(Csr, CalculatesNonzerosPerRow) -{ - set_up_apply_data(std::make_shared()); - gko::Array row_nnz(ref, mtx->get_size()[0]); - gko::Array drow_nnz(hip, dmtx->get_size()[0]); - - gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), - row_nnz.get_data()); - gko::kernels::hip::csr::count_nonzeros_per_row(hip, dmtx.get(), - drow_nnz.get_data()); - - GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); -} - - TEST_F(Csr, ConvertToHybridIsEquivalentToRef) { using Hybrid_type = gko::matrix::Hybrid<>; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 8bda4e728c8..c8de479999f 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -338,6 +338,13 @@ class Coo : public EnableLinOp>, GKO_ASSERT_EQ(values_.get_num_elems(), row_idxs_.get_num_elems()); } + /** + * Resizes the matrix and associated storage to the given sizes. + * Internal storage may be reallocated if they don't match the old values. + * + * @param new_size the new matrix dimensions. + * @param nnz the new number of nonzeros. + */ void resize(dim<2> new_size, size_type nnz); void apply_impl(const LinOp* b, LinOp* x) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index d9545099d84..f640644ba05 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -1030,6 +1030,8 @@ class Dense * * If the new size matches the current size, the stride will be left * unchanged, otherwise it will be set to the number of columns. + * + * @param new_size the new matrix dimensions */ void resize(gko::dim<2> new_size); diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index 5c82df72de1..c0b6896c1be 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -350,6 +350,15 @@ class Ell : public EnableLinOp>, col_idxs_.get_num_elems()); } + /** + * Resizes the matrix to the given dimensions and row nonzero count. + * If the dimensions or row nonzero count don't match their old values, + * the column stride will be reset to the number of rows and the internal + * storage reallocated to match these values. + * + * @param new_size the new matrix dimensions + * @param max_row_nnz the new number of nonzeros per row + */ void resize(dim<2> new_size, size_type max_row_nnz); void apply_impl(const LinOp* b, LinOp* x) const override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 6c85151f14b..707d22ac68d 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -736,6 +736,16 @@ class Hybrid strategy_(std::move(strategy)) {} + /** + * Resizes the matrix to the given dimensions and storage sizes. + * + * @param new_size the new matrix dimensions + * @param ell_row_nnz the number of non-zeros per row stored in ELL + * @param coo_nnz the number of non-zeros stored in COO + * + * @see Ell::resize(dim<2>, size_type) + * @see Coo::resize(dim<2>, size_type) + */ void resize(dim<2> new_size, size_type ell_row_nnz, size_type coo_nnz); void apply_impl(const LinOp* b, LinOp* x) const override; diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index c82392c156b..6780cc8297a 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -479,21 +479,6 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef) } -TEST_F(Csr, CalculatesNonzerosPerRow) -{ - set_up_apply_data(); - gko::Array row_nnz(ref, mtx->get_size()[0]); - gko::Array drow_nnz(omp, dmtx->get_size()[0]); - - gko::kernels::reference::csr::count_nonzeros_per_row(ref, mtx.get(), - row_nnz.get_data()); - gko::kernels::omp::csr::count_nonzeros_per_row(omp, dmtx.get(), - drow_nnz.get_data()); - - GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); -} - - TEST_F(Csr, ConvertToHybridIsEquivalentToRef) { using Hybrid_type = gko::matrix::Hybrid<>; diff --git a/reference/components/format_conversion_kernels.cpp b/reference/components/format_conversion_kernels.cpp index 47ff5a2e9ff..8ab1d8d071a 100644 --- a/reference/components/format_conversion_kernels.cpp +++ b/reference/components/format_conversion_kernels.cpp @@ -80,18 +80,17 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS32); GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_IDXS_TO_PTRS64); -template +template void convert_ptrs_to_sizes(std::shared_ptr exec, const RowPtrType* ptrs, size_type num_blocks, - IndexType* sizes) + size_type* sizes) { for (size_type block = 0; block < num_blocks; block++) { sizes[block] = ptrs[block + 1] - ptrs[block]; } } -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES32); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES64); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CONVERT_PTRS_TO_SIZES); } // namespace components diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index ba96df7fe64..7ce43de53ed 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -881,21 +881,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); -template -void count_nonzeros_per_row(std::shared_ptr exec, - const matrix::Csr* source, - size_type* result) -{ - const auto row_ptrs = source->get_const_row_ptrs(); - for (size_type i = 0; i < source->get_size()[0]; i++) { - result[i] = row_ptrs[i + 1] - row_ptrs[i]; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_COUNT_NONZEROS_PER_ROW_KERNEL); - - template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt index 48e35aba6dd..e9737f5c106 100644 --- a/reference/test/components/CMakeLists.txt +++ b/reference/test/components/CMakeLists.txt @@ -1,5 +1,6 @@ ginkgo_create_test(absolute_array_kernels) ginkgo_create_test(fill_array_kernels) +ginkgo_create_test(format_conversion_kernels) ginkgo_create_test(precision_conversion_kernels) ginkgo_create_test(prefix_sum_kernels) ginkgo_create_test(reduce_array_kernels) diff --git a/reference/test/components/format_conversion_kernels.cpp b/reference/test/components/format_conversion_kernels.cpp new file mode 100644 index 00000000000..d308f315d64 --- /dev/null +++ b/reference/test/components/format_conversion_kernels.cpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/format_conversion_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class FormatConversion : public ::testing::Test { +protected: + FormatConversion() : ref(gko::ReferenceExecutor::create()) {} + + std::shared_ptr ref; +}; + +TYPED_TEST_SUITE(FormatConversion, gko::test::IndexTypes, + TypenameNameGenerator); + + +TYPED_TEST(FormatConversion, ConvertsEmptyPtrsToIdxs) +{ + std::vector ptrs(10); + TypeParam* out = nullptr; + + gko::kernels::reference::components::convert_ptrs_to_idxs( + this->ref, ptrs.data(), 9, out); + + // mustn't segfault +} + + +TYPED_TEST(FormatConversion, ConvertsPtrsToIdxs) +{ + std::vector ptrs{0, 1, 3, 5, 5}; + std::vector idxs(5); + std::vector reference{0, 1, 1, 2, 2}; + + gko::kernels::reference::components::convert_ptrs_to_idxs( + this->ref, ptrs.data(), 4, idxs.data()); + + ASSERT_EQ(idxs, reference); +} + + +TYPED_TEST(FormatConversion, ConvertsEmptyIdxsToPtrs) +{ + std::vector idxs; + std::vector ptrs(10); + std::vector reference(10); + + gko::kernels::reference::components::convert_idxs_to_ptrs( + this->ref, idxs.data(), 0, 9, ptrs.data()); + + ASSERT_EQ(ptrs, reference); +} + + +TYPED_TEST(FormatConversion, ConvertsIdxsToPtrs) +{ + std::vector idxs{1, 1, 1, 2, 2, 4}; + std::vector ptrs(6); + std::vector reference{0, 0, 3, 5, 5, 6}; + + gko::kernels::reference::components::convert_idxs_to_ptrs( + this->ref, idxs.data(), 6, 5, ptrs.data()); + + ASSERT_EQ(ptrs, reference); +} + + +TYPED_TEST(FormatConversion, ConvertsPtrsToSizes) +{ + std::vector ptrs{0, 1, 3, 5, 5}; + std::vector sizes(4); + std::vector reference{1, 2, 2, 0}; + + gko::kernels::reference::components::convert_ptrs_to_sizes( + this->ref, ptrs.data(), 4, sizes.data()); + + ASSERT_EQ(sizes, reference); +} + + +} // namespace diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index b1161cff463..31a4c167b39 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -1009,19 +1009,6 @@ TYPED_TEST(Csr, MovesEmptyToHybrid) } -TYPED_TEST(Csr, CalculatesNonzerosPerRow) -{ - gko::Array row_nnz(this->exec, this->mtx->get_size()[0]); - - gko::kernels::reference::csr::count_nonzeros_per_row( - this->exec, this->mtx.get(), row_nnz.get_data()); - - auto row_nnz_val = row_nnz.get_data(); - ASSERT_EQ(row_nnz_val[0], 3); - ASSERT_EQ(row_nnz_val[1], 1); -} - - TYPED_TEST(Csr, ConvertsToEll) { using Ell = typename TestFixture::Ell; diff --git a/test/components/CMakeLists.txt b/test/components/CMakeLists.txt index ad8e81689ac..934445cf230 100644 --- a/test/components/CMakeLists.txt +++ b/test/components/CMakeLists.txt @@ -1,5 +1,6 @@ ginkgo_create_common_test(absolute_array_kernels) ginkgo_create_common_and_reference_test(device_matrix_data_kernels) ginkgo_create_common_test(fill_array_kernels) +ginkgo_create_common_test(format_conversion_kernels) ginkgo_create_common_test(precision_conversion_kernels) ginkgo_create_common_test(reduce_array_kernels) diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp new file mode 100644 index 00000000000..0fc64f0feb3 --- /dev/null +++ b/test/components/format_conversion_kernels.cpp @@ -0,0 +1,168 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/components/format_conversion_kernels.hpp" + + +#include +#include +#include + + +#include + + +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + +namespace { + + +template +class FormatConversion : public ::testing::Test { +protected: + FormatConversion() : rand(293), size(42793) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + init_executor(ref, exec); + sizes.set_executor(ref); + ptrs.set_executor(ref); + idxs.set_executor(ref); + std::uniform_int_distribution row_dist{0, 10}; + sizes.resize_and_reset(size); + ptrs.resize_and_reset(size + 1); + ptrs.get_data()[0] = 0; + for (gko::size_type i = 0; i < size; i++) { + sizes.get_data()[i] = row_dist(rand); + ptrs.get_data()[i + 1] = ptrs.get_data()[i] + sizes.get_data()[i]; + } + idxs.resize_and_reset(ptrs.get_const_data()[size]); + for (gko::size_type i = 0; i < size; i++) { + auto begin = ptrs.get_const_data()[i]; + auto end = ptrs.get_const_data()[i + 1]; + for (auto j = begin; j < end; j++) { + idxs.get_data()[j] = i; + } + } + sizes.set_executor(exec); + ptrs.set_executor(exec); + idxs.set_executor(exec); + } + + void TearDown() + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } + + std::shared_ptr ref; + std::shared_ptr exec; + gko::size_type size; + std::default_random_engine rand; + gko::Array sizes; + gko::Array ptrs; + gko::Array idxs; +}; + +TYPED_TEST_SUITE(FormatConversion, gko::test::IndexTypes, + TypenameNameGenerator); + + +TYPED_TEST(FormatConversion, ConvertsEmptyPtrsToIdxs) +{ + gko::Array ptrs(this->exec, this->size + 1); + ptrs.fill(0); + TypeParam* output = nullptr; + + gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs( + this->exec, ptrs.get_const_data(), this->size, output); + + // mustn't segfault +} + + +TYPED_TEST(FormatConversion, ConvertPtrsToIdxs) +{ + auto ref = this->idxs; + this->idxs.fill(-1); + + gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs( + this->exec, this->ptrs.get_const_data(), this->size, + this->idxs.get_data()); + + GKO_ASSERT_ARRAY_EQ(this->idxs, ref); +} + + +TYPED_TEST(FormatConversion, ConvertsEmptyIdxsToPtrs) +{ + this->ptrs.fill(0); + auto ref = this->ptrs; + TypeParam* input = nullptr; + + gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( + this->exec, input, 0, this->size, this->ptrs.get_data()); + + GKO_ASSERT_ARRAY_EQ(this->ptrs, ref); +} + + +TYPED_TEST(FormatConversion, ConvertIdxsToPtrsIsEquivalentToRef) +{ + auto ref = this->ptrs; + this->ptrs.fill(-1); + + gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( + this->exec, this->idxs.get_const_data(), this->idxs.get_num_elems(), + this->size, this->ptrs.get_data()); + + GKO_ASSERT_ARRAY_EQ(this->ptrs, ref); +} + + +TYPED_TEST(FormatConversion, ConvertPtrsToSizesIsEquivalentToRef) +{ + auto ref = this->sizes; + this->sizes.fill(12345); + + gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_sizes( + this->exec, this->ptrs.get_const_data(), this->size, + this->sizes.get_data()); + + GKO_ASSERT_ARRAY_EQ(this->sizes, ref); +} + + +} // namespace From b4cfc273ece314ad150d653ee6e295cbcbb65499 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 25 Jan 2022 19:12:48 +0100 Subject: [PATCH 27/32] simplify cuSPARSE type handling Co-authored-by: Pratik Nayak --- cuda/base/types.hpp | 66 +++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 8c8df0c912b..eadf893c314 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -212,40 +212,21 @@ struct cuda_type_impl> { template struct cuda_data_type_impl {}; -template <> -struct cuda_data_type_impl { - constexpr static cudaDataType_t value = CUDA_R_16F; -}; - -template <> -struct cuda_data_type_impl { - constexpr static cudaDataType_t value = CUDA_R_32F; -}; - -template <> -struct cuda_data_type_impl { - constexpr static cudaDataType_t value = CUDA_R_64F; -}; - -template <> -struct cuda_data_type_impl> { - constexpr static cudaDataType_t value = CUDA_C_32F; -}; - -template <> -struct cuda_data_type_impl> { - constexpr static cudaDataType_t value = CUDA_C_64F; -}; +#define GKO_CUDA_DATA_TYPE(_type, _value) \ + template <> \ + struct cuda_data_type_impl<_type> { \ + constexpr static cudaDataType_t value = _value; \ + } -template <> -struct cuda_data_type_impl { - constexpr static cudaDataType_t value = CUDA_R_32I; -}; +GKO_CUDA_DATA_TYPE(float16, CUDA_R_16F); +GKO_CUDA_DATA_TYPE(float, CUDA_R_32F); +GKO_CUDA_DATA_TYPE(double, CUDA_R_64F); +GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_32F); +GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_64F); +GKO_CUDA_DATA_TYPE(int32, CUDA_R_32I); +GKO_CUDA_DATA_TYPE(int8, CUDA_R_8I); -template <> -struct cuda_data_type_impl { - constexpr static cudaDataType_t value = CUDA_R_8I; -}; +#undef GKO_CUDA_DATA_TYPE #if defined(CUDA_VERSION) && \ @@ -256,20 +237,17 @@ struct cuda_data_type_impl { template struct cusparse_index_type_impl {}; -template <> -struct cusparse_index_type_impl { - constexpr static cusparseIndexType_t value = CUSPARSE_INDEX_16U; -}; +#define GKO_CUDA_INDEX_TYPE(_type, _value) \ + template <> \ + struct cusparse_index_type_impl<_type> { \ + constexpr static cusparseIndexType_t value = _value; \ + } -template <> -struct cusparse_index_type_impl { - constexpr static cusparseIndexType_t value = CUSPARSE_INDEX_32I; -}; +GKO_CUDA_INDEX_TYPE(std::uint16_t, CUSPARSE_INDEX_16U); +GKO_CUDA_INDEX_TYPE(int32, CUSPARSE_INDEX_32I); +GKO_CUDA_INDEX_TYPE(int64, CUSPARSE_INDEX_64I); -template <> -struct cusparse_index_type_impl { - constexpr static cusparseIndexType_t value = CUSPARSE_INDEX_64I; -}; +#undef GKO_CUDA_INDEX_TYPE #endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= From 84ebf9de3286a4874ff43b6fc770be8475ebfd2d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 25 Jan 2022 20:34:34 +0100 Subject: [PATCH 28/32] fix accessor compilation issues --- omp/matrix/dense_kernels.cpp | 5 +++-- omp/matrix/fbcsr_kernels.cpp | 2 +- reference/matrix/dense_kernels.cpp | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 360f05448ee..56735caa436 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -239,8 +239,9 @@ void convert_to_fbcsr(std::shared_ptr exec, const auto num_block_rows = num_rows / bs; const auto num_block_cols = num_cols / bs; acc::range> blocks( - std::array{nzbs, static_cast(bs), - static_cast(bs)}, + std::array{static_cast(nzbs), + static_cast(bs), + static_cast(bs)}, result->get_values()); auto col_idxs = result->get_col_idxs(); #pragma omp parallel for diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index 4dcc13b2140..5e0b72715bd 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -220,7 +220,7 @@ void fill_in_dense(std::shared_ptr exec, auto row_ptrs = source->get_const_row_ptrs(); auto col_idxs = source->get_const_col_idxs(); const acc::range> values{ - to_std_array(nbnz, bs, bs), source->get_const_values()}; + to_std_array(nbnz, bs, bs), source->get_const_values()}; #pragma omp parallel for for (size_type block_row = 0; block_row < nbrows; block_row++) { const auto row_begin = row_ptrs[block_row]; diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index cd19aa5299e..799f46bcde8 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -477,8 +477,9 @@ void convert_to_fbcsr(std::shared_ptr exec, const auto num_block_rows = num_rows / bs; const auto num_block_cols = num_cols / bs; acc::range> blocks( - std::array{nzbs, static_cast(bs), - static_cast(bs)}, + std::array{static_cast(nzbs), + static_cast(bs), + static_cast(bs)}, result->get_values()); auto col_idxs = result->get_col_idxs(); for (size_type brow = 0; brow < num_block_rows; ++brow) { From b982ec6a67f76ca9622c15816edf89c4be6e5234 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 30 Jan 2022 23:06:48 +0100 Subject: [PATCH 29/32] review updates * avoid unnecessary copy in cross-executor convert_to(Dense) * improve naming in format conversion tests * make format conversion tests more resilient Co-authored-by: Yuhsiang Tsai --- core/matrix/coo.cpp | 2 +- core/matrix/csr.cpp | 2 +- core/matrix/ell.cpp | 2 +- core/matrix/fbcsr.cpp | 2 +- core/matrix/sellp.cpp | 2 +- core/matrix/sparsity_csr.cpp | 2 +- test/components/format_conversion_kernels.cpp | 17 +++++++++-------- 7 files changed, 15 insertions(+), 14 deletions(-) diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 4163be966c2..0b5781e7a5d 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -185,7 +185,7 @@ void Coo::convert_to(Dense* result) const result->resize(this->get_size()); result->fill(zero()); exec->run(coo::make_fill_in_dense( - this, make_temporary_clone(exec, result).get())); + this, make_temporary_output_clone(exec, result).get())); } diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index ed75ed45287..54c735f89b6 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -214,7 +214,7 @@ void Csr::convert_to(Dense* result) const result->resize(this->get_size()); result->fill(zero()); exec->run(csr::make_fill_in_dense( - this, make_temporary_clone(exec, result).get())); + this, make_temporary_output_clone(exec, result).get())); } diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index bc6a2721158..684fb7e627d 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -135,7 +135,7 @@ void Ell::convert_to(Dense* result) const result->resize(this->get_size()); result->fill(zero()); exec->run(ell::make_fill_in_dense( - this, make_temporary_clone(exec, result).get())); + this, make_temporary_output_clone(exec, result).get())); } diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 8d54f480036..688d7aa4b9f 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -158,7 +158,7 @@ void Fbcsr::convert_to( result->resize(this->get_size()); result->fill(zero()); exec->run(fbcsr::make_fill_in_dense( - this, make_temporary_clone(exec, result).get())); + this, make_temporary_output_clone(exec, result).get())); } diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index d2db34d3736..b5e70c23647 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -130,7 +130,7 @@ void Sellp::convert_to(Dense* result) const result->resize(this->get_size()); result->fill(zero()); exec->run(sellp::make_fill_in_dense( - this, make_temporary_clone(exec, result).get())); + this, make_temporary_output_clone(exec, result).get())); } diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 6cb9b598403..8d32985abf1 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -127,7 +127,7 @@ void SparsityCsr::convert_to( Dense* result) const { auto exec = this->get_executor(); - auto tmp = make_temporary_clone(exec, result); + auto tmp = make_temporary_output_clone(exec, result); tmp->resize(this->get_size()); tmp->fill(zero()); exec->run(sparsity_csr::make_fill_in_dense(this, tmp.get())); diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp index 0fc64f0feb3..2de5c5c9363 100644 --- a/test/components/format_conversion_kernels.cpp +++ b/test/components/format_conversion_kernels.cpp @@ -115,53 +115,54 @@ TYPED_TEST(FormatConversion, ConvertsEmptyPtrsToIdxs) TYPED_TEST(FormatConversion, ConvertPtrsToIdxs) { - auto ref = this->idxs; + auto ref_idxs = this->idxs; this->idxs.fill(-1); gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs( this->exec, this->ptrs.get_const_data(), this->size, this->idxs.get_data()); - GKO_ASSERT_ARRAY_EQ(this->idxs, ref); + GKO_ASSERT_ARRAY_EQ(this->idxs, ref_idxs); } TYPED_TEST(FormatConversion, ConvertsEmptyIdxsToPtrs) { this->ptrs.fill(0); - auto ref = this->ptrs; + auto ref_ptrs = this->ptrs; + this->ptrs.fill(-1); TypeParam* input = nullptr; gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( this->exec, input, 0, this->size, this->ptrs.get_data()); - GKO_ASSERT_ARRAY_EQ(this->ptrs, ref); + GKO_ASSERT_ARRAY_EQ(this->ptrs, ref_ptrs); } TYPED_TEST(FormatConversion, ConvertIdxsToPtrsIsEquivalentToRef) { - auto ref = this->ptrs; + auto ref_ptrs = this->ptrs; this->ptrs.fill(-1); gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( this->exec, this->idxs.get_const_data(), this->idxs.get_num_elems(), this->size, this->ptrs.get_data()); - GKO_ASSERT_ARRAY_EQ(this->ptrs, ref); + GKO_ASSERT_ARRAY_EQ(this->ptrs, ref_ptrs); } TYPED_TEST(FormatConversion, ConvertPtrsToSizesIsEquivalentToRef) { - auto ref = this->sizes; + auto ref_sizes = this->sizes; this->sizes.fill(12345); gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_sizes( this->exec, this->ptrs.get_const_data(), this->size, this->sizes.get_data()); - GKO_ASSERT_ARRAY_EQ(this->sizes, ref); + GKO_ASSERT_ARRAY_EQ(this->sizes, ref_sizes); } From 2d27767c459123c4ccf5aaa69bd0c4e1de88dd7c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 30 Jan 2022 23:23:02 +0100 Subject: [PATCH 30/32] add shortcuts for common reduction boilerplate --- .../unified/base/kernel_launch_reduction.hpp | 10 ++++++++ .../components/reduce_array_kernels.cpp | 3 +-- .../unified/distributed/partition_kernels.cpp | 3 +-- common/unified/matrix/dense_kernels.cpp | 25 +++++++------------ common/unified/matrix/ell_kernels.cpp | 6 ++--- common/unified/matrix/sellp_kernels.cpp | 3 +-- 6 files changed, 24 insertions(+), 26 deletions(-) diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp index 384c876d196..3e64ddc2819 100644 --- a/common/unified/base/kernel_launch_reduction.hpp +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -37,6 +37,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#define GKO_KERNEL_REDUCE_SUM(ValueType) \ + [] GKO_KERNEL(auto a, auto b) { return a + b; }, \ + [] GKO_KERNEL(auto a) { return a; }, ValueType \ + {} +#define GKO_KERNEL_REDUCE_MAX(ValueType) \ + [] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; }, \ + [] GKO_KERNEL(auto a) { return a; }, ValueType \ + {} + + #if defined(GKO_COMPILING_CUDA) #include "cuda/base/kernel_launch_reduction.cuh" #elif defined(GKO_COMPILING_HIP) diff --git a/common/unified/components/reduce_array_kernels.cpp b/common/unified/components/reduce_array_kernels.cpp index 73913573b66..7fe0a89b795 100644 --- a/common/unified/components/reduce_array_kernels.cpp +++ b/common/unified/components/reduce_array_kernels.cpp @@ -60,8 +60,7 @@ void reduce_add_array(std::shared_ptr exec, [] GKO_KERNEL(auto i, auto array, auto result) { return i == 0 ? (array[i] + result[0]) : array[i]; }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result.get_data(), + GKO_KERNEL_REDUCE_SUM(ValueType), result.get_data(), array.get_num_elems(), array, result); } diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp index 1a8567cbaf9..fa5b2ed0470 100644 --- a/common/unified/distributed/partition_kernels.cpp +++ b/common/unified/distributed/partition_kernels.cpp @@ -57,8 +57,7 @@ void count_ranges(std::shared_ptr exec, auto prev_part = i == 0 ? comm_index_type{-1} : mapping[i - 1]; return cur_part != prev_part ? 1 : 0; }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return a; }, size_type{}, result.get_data(), + GKO_KERNEL_REDUCE_SUM(size_type), result.get_data(), mapping.get_num_elems(), mapping); num_ranges = exec->copy_val_to_host(result.get_const_data()); } diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index eec8577ef7a..2cd9b3d49ca 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -252,9 +252,8 @@ void compute_dot(std::shared_ptr exec, [] GKO_KERNEL(auto i, auto j, auto x, auto y) { return x(i, j) * y(i, j); }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), - x->get_size(), x, y); + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); @@ -271,9 +270,8 @@ void compute_conj_dot(std::shared_ptr exec, [] GKO_KERNEL(auto i, auto j, auto x, auto y) { return conj(x(i, j)) * y(i, j); }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), - x->get_size(), x, y); + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); @@ -301,9 +299,8 @@ void compute_norm1(std::shared_ptr exec, { run_kernel_col_reduction( exec, [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return a; }, remove_complex{}, - result->get_values(), x->get_size(), x); + GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), + x->get_size(), x); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); @@ -318,8 +315,7 @@ void compute_max_nnz_per_row(std::shared_ptr exec, count_nonzeros_per_row(exec, source, partial.get_data()); run_kernel_reduction( exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, - [] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; }, - [] GKO_KERNEL(auto a) { return a; }, size_type{}, + GKO_KERNEL_REDUCE_MAX(size_type), partial.get_data() + source->get_size()[0], source->get_size()[0], partial); result = exec->copy_val_to_host(partial.get_const_data() + @@ -351,8 +347,7 @@ void compute_slice_sets(std::shared_ptr exec, stride_factor) : size_type{}; }, - [] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; }, - [] GKO_KERNEL(auto a) { return a; }, size_type{}, slice_lengths, 1, + GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, num_rows); exec->copy(num_slices, slice_lengths, slice_sets); @@ -373,9 +368,7 @@ void count_nonzeros_per_row(std::shared_ptr exec, [] GKO_KERNEL(auto i, auto j, auto mtx) { return is_nonzero(mtx(i, j)) ? 1 : 0; }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return a; }, IndexType{}, result, 1, - mtx->get_size(), mtx); + GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp index 1c0692f53b0..b518c538eca 100644 --- a/common/unified/matrix/ell_kernels.cpp +++ b/common/unified/matrix/ell_kernels.cpp @@ -62,8 +62,7 @@ void compute_max_row_nnz(std::shared_ptr exec, [] GKO_KERNEL(auto i, auto row_ptrs) { return row_ptrs[i + 1] - row_ptrs[i]; }, - [] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; }, - [] GKO_KERNEL(auto a) { return a; }, size_type{}, result.get_data(), + GKO_KERNEL_REDUCE_MAX(size_type), result.get_data(), row_ptrs.get_num_elems() - 1, row_ptrs); max_nnz = exec->copy_val_to_host(result.get_const_data()); } @@ -169,8 +168,7 @@ void count_nonzeros_per_row(std::shared_ptr exec, const auto ell_idx = ell_col * ell_stride + row; return is_nonzero(in_vals[ell_idx]) ? 1 : 0; }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return a; }, IndexType{}, result, + GKO_KERNEL_REDUCE_SUM(IndexType), result, dim<2>{source->get_num_stored_elements_per_row(), source->get_size()[0]}, static_cast(source->get_stride()), source->get_const_values()); diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp index 1347a746a6c..c5b505aa8c1 100644 --- a/common/unified/matrix/sellp_kernels.cpp +++ b/common/unified/matrix/sellp_kernels.cpp @@ -73,8 +73,7 @@ void compute_slice_sets(std::shared_ptr exec, stride_factor) : size_type{}; }, - [] GKO_KERNEL(auto a, auto b) { return a > b ? a : b; }, - [] GKO_KERNEL(auto a) { return a; }, size_type{}, slice_lengths, 1, + GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, gko::dim<2>{num_slices, slice_size}, row_ptrs, slice_size, stride_factor, num_rows); exec->copy(num_slices, slice_lengths, slice_sets); From f98170c22321390ca3401f452aa42b6e34ef658c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 31 Jan 2022 13:43:57 +0100 Subject: [PATCH 31/32] fix CUDA version detection for triangular solve --- cuda/test/solver/lower_trs_kernels.cpp | 3 +++ cuda/test/solver/upper_trs_kernels.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp index 59ad0ab0cfc..485891f16e5 100644 --- a/cuda/test/solver/lower_trs_kernels.cpp +++ b/cuda/test/solver/lower_trs_kernels.cpp @@ -37,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp index bfd81b72675..9e6566f9030 100644 --- a/cuda/test/solver/upper_trs_kernels.cpp +++ b/cuda/test/solver/upper_trs_kernels.cpp @@ -37,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include From a7379ada9a1449e09aecaef479c9eb3fc477b555 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 31 Jan 2022 17:39:14 +0100 Subject: [PATCH 32/32] fix dpcpp compilation --- dpcpp/matrix/dense_kernels.dp.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 50477c326aa..626fd17dab0 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -86,7 +86,7 @@ namespace kernel { template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, - const size_type* __restrict__ row_ptrs, + const int64* __restrict__ row_ptrs, const ValueType* __restrict__ source, IndexType* __restrict__ row_idxs, IndexType* __restrict__ col_idxs, @@ -659,10 +659,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_hybrid(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Hybrid* result) - GKO_NOT_IMPLEMENTED; +void convert_to_hybrid( + std::shared_ptr exec, + const matrix::Dense* source, const int64* coo_row_ptrs, + matrix::Hybrid* result) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL);