diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp index 8ea31358ed5..9aa14243de3 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp @@ -5,6 +5,8 @@ #ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#include "core/solver/batch_bicgstab_kernels.hpp" + #include #include #include @@ -25,6 +27,11 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { + + +constexpr int max_bicgstab_threads = 1024; + + namespace batch_single_kernels { @@ -168,12 +175,14 @@ __device__ __forceinline__ void update_x_middle( template -__global__ void apply_kernel( - const gko::kernels::batch_bicgstab::storage_config sconf, - const int max_iter, const gko::remove_complex tol, - LogType logger, PrecType prec_shared, const BatchMatrixType mat, - const ValueType* const __restrict__ b, ValueType* const __restrict__ x, - ValueType* const __restrict__ workspace = nullptr) +__global__ void __launch_bounds__(max_bicgstab_threads) + apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, + const int max_iter, const gko::remove_complex tol, + LogType logger, PrecType prec_shared, + const BatchMatrixType mat, + const ValueType* const __restrict__ b, + ValueType* const __restrict__ x, + ValueType* const __restrict__ workspace = nullptr) { using real_type = typename gko::remove_complex; const auto num_batch_items = mat.num_batch_items; diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp index 7ccdc5f9926..2c42d359fff 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp @@ -6,6 +6,8 @@ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_ +#include "core/solver/batch_cg_kernels.hpp" + #include #include #include @@ -27,6 +29,11 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { + + +constexpr int max_cg_threads = 1024; + + namespace batch_single_kernels { @@ -113,14 +120,14 @@ __device__ __forceinline__ void update_x_and_r( template -__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, - const int max_iter, - const gko::remove_complex tol, - LogType logger, PrecType prec_shared, - const BatchMatrixType mat, - const ValueType* const __restrict__ b, - ValueType* const __restrict__ x, - ValueType* const __restrict__ workspace = nullptr) +__global__ void __launch_bounds__(max_cg_threads) + apply_kernel(const gko::kernels::batch_cg::storage_config sconf, + const int max_iter, const gko::remove_complex tol, + LogType logger, PrecType prec_shared, + const BatchMatrixType mat, + const ValueType* const __restrict__ b, + ValueType* const __restrict__ x, + ValueType* const __restrict__ workspace = nullptr) { using real_type = typename gko::remove_complex; const auto num_batch_items = mat.num_batch_items; diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index a3604fd9b99..13543ccb624 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -22,14 +22,14 @@ namespace csr { /** * Encapsulates one matrix from a batch of csr matrices. 
*/ -template +template struct batch_item { using value_type = ValueType; using index_type = IndexType; ValueType* values; - const index_type* col_idxs; - const index_type* row_ptrs; + index_type* col_idxs; + index_type* row_ptrs; index_type num_rows; index_type num_cols; index_type num_nnz_per_item; @@ -44,15 +44,15 @@ struct batch_item { /** * A 'simple' structure to store a global uniform batch of csr matrices. */ -template +template struct uniform_batch { using value_type = ValueType; using index_type = IndexType; using entry_type = batch_item; ValueType* values; - const index_type* col_idxs; - const index_type* row_ptrs; + index_type* col_idxs; + index_type* row_ptrs; size_type num_batch_items; index_type num_rows; index_type num_cols; @@ -119,13 +119,13 @@ namespace ell { /** * Encapsulates one matrix from a batch of ell matrices. */ -template +template struct batch_item { using value_type = ValueType; using index_type = IndexType; ValueType* values; - const index_type* col_idxs; + index_type* col_idxs; index_type stride; index_type num_rows; index_type num_cols; @@ -141,14 +141,14 @@ struct batch_item { /** * A 'simple' structure to store a global uniform batch of ell matrices. */ -template +template struct uniform_batch { using value_type = ValueType; using index_type = IndexType; using entry_type = batch_item; ValueType* values; - const index_type* col_idxs; + index_type* col_idxs; size_type num_batch_items; index_type stride; index_type num_rows; diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 018a6674df5..3e3fd01a03c 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -164,6 +164,45 @@ enum class log_type { simple_convergence_completion }; } // namespace log +#define GKO_INDIRECT(...) __VA_ARGS__ + + +#define GKO_BATCH_INSTANTIATE_STOP(macro, ...) \ + GKO_INDIRECT( \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \ + template GKO_INDIRECT( \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleRelResidual)) + +#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...) \ + GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::Identity); \ + template GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \ + template GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::BlockJacobi) + +#define GKO_BATCH_INSTANTIATE_LOGGER(macro, ...) \ + GKO_BATCH_INSTANTIATE_PRECONDITIONER( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_log::SimpleFinalLogger) + +#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...) \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::ell::uniform_batch); \ + template GKO_BATCH_INSTANTIATE_LOGGER( \ + macro, __VA_ARGS__, batch::matrix::dense::uniform_batch); \ + template GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::csr::uniform_batch) + +#define GKO_BATCH_INSTANTIATE(macro, ...) \ + GKO_BATCH_INSTANTIATE_MATRIX(macro, __VA_ARGS__) + + /** * Handles dispatching to the correct instantiation of a batched solver * depending on runtime parameters. 
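
The GKO_BATCH_INSTANTIATE_* chain added above builds the cross product of stop criteria, preconditioners, loggers, and matrix formats, so that one per-combination macro (such as the launch declarations introduced further down) expands into one explicit instantiation per combination. Below is a minimal, self-contained C++ sketch of that pattern using simplified placeholder types (StopAbs, Jacobi, CsrMatrix, ...) rather than the real Ginkgo classes; the real chain additionally threads a logger type through GKO_BATCH_INSTANTIATE_LOGGER and covers three preconditioners and three matrix formats, while the sketch keeps two of each to stay short.

```cpp
// Standalone sketch (placeholder types, not the actual Ginkgo classes) of the
// GKO_BATCH_INSTANTIATE chain: each level appends one template argument and
// prefixes all but its first expansion with `template`, so a single use of
// the outermost macro emits one explicit instantiation per combination.
template <typename ValueType, typename MatrixType, typename PrecType,
          typename StopType>
void launch_apply()
{
    // instantiated once per (ValueType, MatrixType, PrecType, StopType)
}

struct StopAbs {};
struct StopRel {};
struct Identity {};
struct Jacobi {};
struct EllMatrix {};
struct CsrMatrix {};

#define INDIRECT(...) __VA_ARGS__

// innermost level: one expansion per stop criterion
#define INSTANTIATE_STOP(macro, ...)       \
    INDIRECT(macro(__VA_ARGS__, StopAbs)); \
    template INDIRECT(macro(__VA_ARGS__, StopRel))

// middle level: one expansion of the stop level per preconditioner
#define INSTANTIATE_PREC(macro, ...)                \
    INSTANTIATE_STOP(macro, __VA_ARGS__, Identity); \
    template INSTANTIATE_STOP(macro, __VA_ARGS__, Jacobi)

// outermost level: one expansion of the preconditioner level per matrix format
#define INSTANTIATE(macro, ...)                      \
    INSTANTIATE_PREC(macro, __VA_ARGS__, EllMatrix); \
    template INSTANTIATE_PREC(macro, __VA_ARGS__, CsrMatrix)

// the per-combination macro that gets multiplied out
#define DECLARE_LAUNCH(_vtype, mat_t, pre_t, stop_t) \
    void launch_apply<_vtype, mat_t, pre_t, stop_t>()

// expands to 2 x 2 x 2 = 8 explicit instantiations for ValueType = double
template INSTANTIATE(DECLARE_LAUNCH, double);

int main()
{
    // any of the eight instantiated combinations can now be called
    launch_apply<double, CsrMatrix, Jacobi, StopRel>();
}
```

With the real macros (three matrix formats, three preconditioners, one logger, two stop criteria), each launch macro expands to 18 instantiations per value type, which is why the new *_launch.instantiate files below group them with // begin, // split, and // end markers so they can be spread across separate translation units.
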
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 000cb7b215f..bfa65eee79b 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -3,6 +3,8 @@ add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -21,7 +23,9 @@ target_sources(ginkgo_cuda matrix/fft_kernels.cu preconditioner/batch_jacobi_kernels.cu solver/batch_bicgstab_kernels.cu + ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.cu + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.cu solver/upper_trs_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 8a5eee6b196..bd07259f771 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -5,16 +5,13 @@ #include "core/solver/batch_bicgstab_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "cuda/solver/batch_bicgstab_launch.cuh" namespace gko { @@ -23,194 +20,121 @@ namespace cuda { namespace batch_bicgstab { -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? 
max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - batch_single_kernels::apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_bicgstab::settings; - - -template +template class kernel_caller { public: - using value_type = CuValueType; + using cuda_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_bicgstab::storage_config& sconf, - LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); - // TODO - const int block_size = 256; - // get_num_threads_per_block( - // exec_, mat.num_rows); + BatchMatrixType, cuda_value_type>( + exec_); + const int block_size = + get_num_threads_per_block( + exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_bicgstab::compute_shared_storage( - shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), - b.num_rhs); + const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage< + PrecType, cuda_value_type>(shmem_per_blk, padded_num_rows, + mat.get_single_item_num_nnz(), + b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + (sconf.prec_shared ? 
prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + cuda_value_type* const workspace_data = workspace.get_data(); - // TODO: split compilation // Template parameters launch_apply_kernel - // if (sconf.prec_shared) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 7: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 8: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 9: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + 
break; + case 7: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 8: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 9: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -223,9 +147,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh new file mode 100644 index 00000000000..5106b21251e --- /dev/null +++ b/cuda/solver/batch_bicgstab_launch.cuh @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_ +#define GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_ + + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_bicgstab { + + +template +using settings = gko::kernels::batch_bicgstab::settings; + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows); + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_num_threads_per_block< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec, \ + const int num_rows) + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(_vtype) \ + GKO_BATCH_INSTANTIATE( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, _vtype) + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec); + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_max_dynamic_shared_memory< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec) + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ + GKO_BATCH_INSTANTIATE( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype) + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace_data, const int& block_size, + const size_t& shared_size); + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ + mat_t, log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_bicgstab::storage_config& sconf, \ + const 
settings>>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) + + +} // namespace batch_bicgstab +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu new file mode 100644 index 00000000000..ad17394c4a9 --- /dev/null +++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cuda/solver/batch_bicgstab_launch.cuh" + +#include + +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_bicgstab { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + auto get_num_regs = [](const auto func) { + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, func); + return funcattr.numRegs; + }; + const int num_regs_used = std::max( + get_num_regs( + batch_single_kernels::apply_kernel), + get_num_regs( + batch_single_kernels::apply_kernel)); + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, 
device_max_threads); + max_threads = max_threads <= max_bicgstab_threads ? max_threads + : max_bicgstab_threads; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + batch_single_kernels::apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace_data, const int& block_size, + const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); +// end + + +} // namespace batch_bicgstab +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 32e66d7ee54..126a62006cf 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -5,16 +5,13 @@ #include "core/solver/batch_cg_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "cuda/solver/batch_cg_launch.cuh" namespace gko { @@ -23,104 +20,35 @@ namespace cuda { namespace batch_cg { -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int 
num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - batch_single_kernels::apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_cg::settings; - - -template +template class kernel_caller { public: - using value_type = CuValueType; + using cuda_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); + BatchMatrixType, cuda_value_type>( + exec_); const int block_size = get_num_threads_per_block( + BatchMatrixType, cuda_value_type>( exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); @@ -128,69 +56,66 @@ public: padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + cuda_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + (sconf.prec_shared ? 
prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); - - value_type* const workspace_data = workspace.get_data(); - - // TODO: split compilation - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. - // Template parameters launch_apply_kernel - // if (sconf.prec_shared) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); + + cuda_value_type* const workspace_data = workspace.get_data(); + + // Template parameters launch_apply_kernel + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -203,9 +128,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_cg_launch.cuh 
b/cuda/solver/batch_cg_launch.cuh new file mode 100644 index 00000000000..9cb470eb51b --- /dev/null +++ b/cuda/solver/batch_cg_launch.cuh @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_ +#define GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_ + + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows); + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \ + pre_t, stop_t) \ + int get_num_threads_per_block< \ + stop_t>, pre_t>, \ + log_t>>, \ + mat_t>, cuda_type<_vtype>>( \ + std::shared_ptr exec, const int num_rows) + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \ + _vtype) + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec); + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_max_dynamic_shared_memory< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec) + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \ + _vtype) + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define 
GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu new file mode 100644 index 00000000000..89e96e85ace --- /dev/null +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cuda/solver/batch_cg_launch.cuh" + +#include + +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_cg { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + auto get_num_regs = [](const auto func) { + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, func); + return funcattr.numRegs; + }; + const int num_regs_used = std::max( + get_num_regs( + batch_single_kernels::apply_kernel), + get_num_regs( + batch_single_kernels::apply_kernel)); + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, device_max_threads); + max_threads = max_threads <= max_cg_threads ? 
max_threads : max_cg_threads; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + batch_single_kernels::apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +// end + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 7d914d57a81..4a540046322 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. 
solver/batch_cg_launch.instantiate.hip.cpp BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES @@ -18,7 +20,9 @@ set(GINKGO_HIP_SOURCES ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp + ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.hip.cpp + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 17199d2cd19..697bcb94551 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -5,19 +5,13 @@ #include "core/solver/batch_bicgstab_kernels.hpp" #include -#include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_bicgstab_launch.hip.hpp" namespace gko { @@ -51,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr exec, } -template -using settings = gko::kernels::batch_bicgstab::settings; - - -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_bicgstab::storage_config& sconf, - LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -102,99 +73,92 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); - bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(is_block_size_aligned); + GKO_ASSERT(block_size % config::warp_size == 0); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_bicgstab::compute_shared_storage( - shmem_per_blk, padded_num_rows, 
mat.get_single_item_num_nnz(), - b.num_rhs); + const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage< + PrecType, hip_value_type>(shmem_per_blk, padded_num_rows, + mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - bool is_stride_aligned = - sconf.gmem_stride_bytes % sizeof(value_type) == 0; - GKO_ASSERT(is_stride_aligned); + sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + hip_value_type* const workspace_data = workspace.get_data(); - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. // Template parameters launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 7: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 8: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 9: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, 
shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 7: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 8: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 9: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -207,9 +171,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using hip_value_type = hip_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/hip/solver/batch_bicgstab_launch.hip.hpp new file mode 100644 index 00000000000..0f62a9487a3 --- /dev/null +++ b/hip/solver/batch_bicgstab_launch.hip.hpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ +#define GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ + + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_bicgstab { + + +template +using settings = gko::kernels::batch_bicgstab::settings; + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ + mat_t, log_t, pre_t, stop_t) \ + void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_bicgstab::storage_config& sconf, \ + const settings>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const hip_type<_vtype>* const __restrict__ b_values, \ + hip_type<_vtype>* const __restrict__ x_values, \ + hip_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ 
+ GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) + + +} // namespace batch_bicgstab +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif diff --git a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp new file mode 100644 index 00000000000..fb26c562a94 --- /dev/null +++ b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_bicgstab_launch.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_bicgstab { + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_hip_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); +// split 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); +// end + + +} // namespace batch_bicgstab +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 6d5e3bff3b3..25ebd667a7e 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -5,18 +5,13 @@ #include "core/solver/batch_cg_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_cg_launch.hip.hpp" namespace gko { @@ -50,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr exec, } -template -using settings = gko::kernels::batch_cg::settings; - - -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -101,79 +73,74 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); - bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(is_block_size_aligned); + GKO_ASSERT(block_size % config::warp_size == 0); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + hip_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? 
-        auto workspace = gko::array(
+        auto workspace = gko::array(
             exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        bool is_stride_aligned =
-            sconf.gmem_stride_bytes % sizeof(value_type) == 0;
-        GKO_ASSERT(is_stride_aligned);
-
-        value_type* const workspace_data = workspace.get_data();
-
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
-        // Template parameters launch_apply_kernel(
-        //     sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //     block_size, shared_size);
-        // } else {
-        //     switch (sconf.n_shared) {
-        //     case 0:
-        launch_apply_kernel(
-            sconf, logger, prec, mat, b.values, x.values, workspace_data,
-            block_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0);
+
+        hip_value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel
+        if (sconf.prec_shared) {
+            launch_apply_kernel(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 private:
     std::shared_ptr exec_;
-    const settings> settings_;
+    const settings> settings_;
 };
diff --git a/hip/solver/batch_cg_launch.hip.hpp b/hip/solver/batch_cg_launch.hip.hpp
new file mode 100644
index 00000000000..7071c5c4065
--- /dev/null
+++ b/hip/solver/batch_cg_launch.hip.hpp
@@ -0,0 +1,77 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_
+#define GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_
+
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const hip_type<_vtype>* const __restrict__ b_values, \ + hip_type<_vtype>* const __restrict__ x_values, \ + hip_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +} // namespace batch_cg +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif diff --git a/hip/solver/batch_cg_launch.instantiate.hip.cpp b/hip/solver/batch_cg_launch.instantiate.hip.cpp new file mode 100644 index 00000000000..3605a88651d --- /dev/null +++ b/hip/solver/batch_cg_launch.instantiate.hip.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_cg_launch.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_cg { + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, 
+            logger, prec, mat, b_values, x_values, workspace_data);
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
+// end
+
+
+}  // namespace batch_cg
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
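
Note on the pattern used by the new *_launch.hip.hpp / *_launch.instantiate.hip.cpp files: a templated launcher is declared in a header, explicitly instantiated for each compile-time configuration in a separate translation unit (the // begin / // split / // end markers delimit groups that can be compiled separately), and the runtime switch over sconf.n_shared / sconf.prec_shared in kernel_caller::call_kernel picks the matching instantiation. The following is a minimal, self-contained C++ sketch of that idea only; it is not Ginkgo code, and all names in it are invented for illustration.

#include <cstdio>
#include <stdexcept>

// Declaration (would live in a *_launch.hpp-style header).
template <int n_shared, bool prec_shared>
void launch(int num_rows);

// Definition plus explicit instantiations (would live in a
// *_launch.instantiate.cpp-style translation unit).
template <int n_shared, bool prec_shared>
void launch(int num_rows)
{
    std::printf("launch<%d, %s> called for %d rows\n", n_shared,
                prec_shared ? "true" : "false", num_rows);
}

// One explicit instantiation per supported configuration.
template void launch<0, false>(int);
template void launch<1, false>(int);
template void launch<2, false>(int);
template void launch<2, true>(int);

// Runtime dispatch, mirroring the switch over sconf.n_shared / prec_shared.
void dispatch(int n_shared, bool prec_shared, int num_rows)
{
    if (prec_shared) {
        launch<2, true>(num_rows);
        return;
    }
    switch (n_shared) {
    case 0:
        launch<0, false>(num_rows);
        break;
    case 1:
        launch<1, false>(num_rows);
        break;
    case 2:
        launch<2, false>(num_rows);
        break;
    default:
        throw std::runtime_error("unsupported configuration");
    }
}

int main()
{
    dispatch(1, false, 32);
    return 0;
}

Splitting the explicit instantiations across translation units in this way keeps each compiled file small, which presumably is what the // split markers are for: each group can be built independently, reducing per-file compile time and memory use compared to instantiating every configuration in one kernel file.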