diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp index 8ea31358ed5..9aa14243de3 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp @@ -5,6 +5,8 @@ #ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#include "core/solver/batch_bicgstab_kernels.hpp" + #include #include #include @@ -25,6 +27,11 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { + + +constexpr int max_bicgstab_threads = 1024; + + namespace batch_single_kernels { @@ -168,12 +175,14 @@ __device__ __forceinline__ void update_x_middle( template -__global__ void apply_kernel( - const gko::kernels::batch_bicgstab::storage_config sconf, - const int max_iter, const gko::remove_complex tol, - LogType logger, PrecType prec_shared, const BatchMatrixType mat, - const ValueType* const __restrict__ b, ValueType* const __restrict__ x, - ValueType* const __restrict__ workspace = nullptr) +__global__ void __launch_bounds__(max_bicgstab_threads) + apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, + const int max_iter, const gko::remove_complex tol, + LogType logger, PrecType prec_shared, + const BatchMatrixType mat, + const ValueType* const __restrict__ b, + ValueType* const __restrict__ x, + ValueType* const __restrict__ workspace = nullptr) { using real_type = typename gko::remove_complex; const auto num_batch_items = mat.num_batch_items; diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp index 7ccdc5f9926..2c42d359fff 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp @@ -6,6 +6,8 @@ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_ +#include "core/solver/batch_cg_kernels.hpp" + #include #include #include @@ -27,6 +29,11 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { + + +constexpr int max_cg_threads = 1024; + + namespace batch_single_kernels { @@ -113,14 +120,14 @@ __device__ __forceinline__ void update_x_and_r( template -__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, - const int max_iter, - const gko::remove_complex tol, - LogType logger, PrecType prec_shared, - const BatchMatrixType mat, - const ValueType* const __restrict__ b, - ValueType* const __restrict__ x, - ValueType* const __restrict__ workspace = nullptr) +__global__ void __launch_bounds__(max_cg_threads) + apply_kernel(const gko::kernels::batch_cg::storage_config sconf, + const int max_iter, const gko::remove_complex tol, + LogType logger, PrecType prec_shared, + const BatchMatrixType mat, + const ValueType* const __restrict__ b, + ValueType* const __restrict__ x, + ValueType* const __restrict__ workspace = nullptr) { using real_type = typename gko::remove_complex; const auto num_batch_items = mat.num_batch_items; diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index a3604fd9b99..13543ccb624 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -22,14 +22,14 @@ namespace csr { /** * Encapsulates one matrix from a batch of csr matrices. 
*/ -template +template struct batch_item { using value_type = ValueType; using index_type = IndexType; ValueType* values; - const index_type* col_idxs; - const index_type* row_ptrs; + index_type* col_idxs; + index_type* row_ptrs; index_type num_rows; index_type num_cols; index_type num_nnz_per_item; @@ -44,15 +44,15 @@ struct batch_item { /** * A 'simple' structure to store a global uniform batch of csr matrices. */ -template +template struct uniform_batch { using value_type = ValueType; using index_type = IndexType; using entry_type = batch_item; ValueType* values; - const index_type* col_idxs; - const index_type* row_ptrs; + index_type* col_idxs; + index_type* row_ptrs; size_type num_batch_items; index_type num_rows; index_type num_cols; @@ -119,13 +119,13 @@ namespace ell { /** * Encapsulates one matrix from a batch of ell matrices. */ -template +template struct batch_item { using value_type = ValueType; using index_type = IndexType; ValueType* values; - const index_type* col_idxs; + index_type* col_idxs; index_type stride; index_type num_rows; index_type num_cols; @@ -141,14 +141,14 @@ struct batch_item { /** * A 'simple' structure to store a global uniform batch of ell matrices. */ -template +template struct uniform_batch { using value_type = ValueType; using index_type = IndexType; using entry_type = batch_item; ValueType* values; - const index_type* col_idxs; + index_type* col_idxs; size_type num_batch_items; index_type stride; index_type num_rows; diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 018a6674df5..3e3fd01a03c 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -164,6 +164,45 @@ enum class log_type { simple_convergence_completion }; } // namespace log +#define GKO_INDIRECT(...) __VA_ARGS__ + + +#define GKO_BATCH_INSTANTIATE_STOP(macro, ...) \ + GKO_INDIRECT( \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleAbsResidual)); \ + template GKO_INDIRECT( \ + macro(__VA_ARGS__, \ + ::gko::batch::solver::device::batch_stop::SimpleRelResidual)) + +#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...) \ + GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::Identity); \ + template GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \ + template GKO_BATCH_INSTANTIATE_STOP( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_preconditioner::BlockJacobi) + +#define GKO_BATCH_INSTANTIATE_LOGGER(macro, ...) \ + GKO_BATCH_INSTANTIATE_PRECONDITIONER( \ + macro, __VA_ARGS__, \ + ::gko::batch::solver::device::batch_log::SimpleFinalLogger) + +#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...) \ + GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::ell::uniform_batch); \ + template GKO_BATCH_INSTANTIATE_LOGGER( \ + macro, __VA_ARGS__, batch::matrix::dense::uniform_batch); \ + template GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__, \ + batch::matrix::csr::uniform_batch) + +#define GKO_BATCH_INSTANTIATE(macro, ...) \ + GKO_BATCH_INSTANTIATE_MATRIX(macro, __VA_ARGS__) + + /** * Handles dispatching to the correct instantiation of a batched solver * depending on runtime parameters. 
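
The GKO_BATCH_INSTANTIATE_* chain added above builds the cross product of stop criteria, preconditioners, loggers, and matrix formats, so that one per-combination macro (such as the launch declarations introduced further down) expands into one explicit instantiation per combination. Below is a minimal, self-contained C++ sketch of that pattern using simplified placeholder types (StopAbs, Jacobi, CsrMatrix, ...) rather than the real Ginkgo classes; the real chain additionally threads a logger type through GKO_BATCH_INSTANTIATE_LOGGER and covers three preconditioners and three matrix formats, while the sketch keeps two of each to stay short.

```cpp
// Standalone sketch (placeholder types, not the actual Ginkgo classes) of the
// GKO_BATCH_INSTANTIATE chain: each level appends one template argument and
// prefixes all but its first expansion with `template`, so a single use of
// the outermost macro emits one explicit instantiation per combination.
template <typename ValueType, typename MatrixType, typename PrecType,
          typename StopType>
void launch_apply()
{
    // instantiated once per (ValueType, MatrixType, PrecType, StopType)
}

struct StopAbs {};
struct StopRel {};
struct Identity {};
struct Jacobi {};
struct EllMatrix {};
struct CsrMatrix {};

#define INDIRECT(...) __VA_ARGS__

// innermost level: one expansion per stop criterion
#define INSTANTIATE_STOP(macro, ...)       \
    INDIRECT(macro(__VA_ARGS__, StopAbs)); \
    template INDIRECT(macro(__VA_ARGS__, StopRel))

// middle level: one expansion of the stop level per preconditioner
#define INSTANTIATE_PREC(macro, ...)                \
    INSTANTIATE_STOP(macro, __VA_ARGS__, Identity); \
    template INSTANTIATE_STOP(macro, __VA_ARGS__, Jacobi)

// outermost level: one expansion of the preconditioner level per matrix format
#define INSTANTIATE(macro, ...)                      \
    INSTANTIATE_PREC(macro, __VA_ARGS__, EllMatrix); \
    template INSTANTIATE_PREC(macro, __VA_ARGS__, CsrMatrix)

// the per-combination macro that gets multiplied out
#define DECLARE_LAUNCH(_vtype, mat_t, pre_t, stop_t) \
    void launch_apply<_vtype, mat_t, pre_t, stop_t>()

// expands to 2 x 2 x 2 = 8 explicit instantiations for ValueType = double
template INSTANTIATE(DECLARE_LAUNCH, double);

int main()
{
    // any of the eight instantiated combinations can now be called
    launch_apply<double, CsrMatrix, Jacobi, StopRel>();
}
```

With the real macros (three matrix formats, three preconditioners, one logger, two stop criteria), each launch macro expands to 18 instantiations per value type, which is why the new *_launch.instantiate files below group them with // begin, // split, and // end markers so they can be spread across separate translation units.
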
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 000cb7b215f..bfa65eee79b 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -3,6 +3,8 @@ add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -21,7 +23,9 @@ target_sources(ginkgo_cuda matrix/fft_kernels.cu preconditioner/batch_jacobi_kernels.cu solver/batch_bicgstab_kernels.cu + ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.cu + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.cu solver/upper_trs_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 8a5eee6b196..bd07259f771 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -5,16 +5,13 @@ #include "core/solver/batch_bicgstab_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "cuda/solver/batch_bicgstab_launch.cuh" namespace gko { @@ -23,194 +20,121 @@ namespace cuda { namespace batch_bicgstab { -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? 
max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - batch_single_kernels::apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_bicgstab::settings; - - -template +template class kernel_caller { public: - using value_type = CuValueType; + using cuda_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_bicgstab::storage_config& sconf, - LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); - // TODO - const int block_size = 256; - // get_num_threads_per_block( - // exec_, mat.num_rows); + BatchMatrixType, cuda_value_type>( + exec_); + const int block_size = + get_num_threads_per_block( + exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_bicgstab::compute_shared_storage( - shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), - b.num_rhs); + const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage< + PrecType, cuda_value_type>(shmem_per_blk, padded_num_rows, + mat.get_single_item_num_nnz(), + b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + (sconf.prec_shared ? 
prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + cuda_value_type* const workspace_data = workspace.get_data(); - // TODO: split compilation // Template parameters launch_apply_kernel - // if (sconf.prec_shared) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 7: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 8: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 9: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + 
break; + case 7: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 8: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 9: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -223,9 +147,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_bicgstab_launch.cuh b/cuda/solver/batch_bicgstab_launch.cuh new file mode 100644 index 00000000000..5106b21251e --- /dev/null +++ b/cuda/solver/batch_bicgstab_launch.cuh @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_ +#define GKO_CUDA_SOLVER_BATCH_BICGSTAB_LAUNCH_CUH_ + + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_bicgstab { + + +template +using settings = gko::kernels::batch_bicgstab::settings; + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows); + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_num_threads_per_block< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec, \ + const int num_rows) + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK(_vtype) \ + GKO_BATCH_INSTANTIATE( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK_, _vtype) + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec); + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_max_dynamic_shared_memory< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec) + +#define GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ + GKO_BATCH_INSTANTIATE( \ + GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY_, _vtype) + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace_data, const int& block_size, + const size_t& shared_size); + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ + mat_t, log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_bicgstab::storage_config& sconf, \ + const 
settings>>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) + + +} // namespace batch_bicgstab +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif diff --git a/cuda/solver/batch_bicgstab_launch.instantiate.cu b/cuda/solver/batch_bicgstab_launch.instantiate.cu new file mode 100644 index 00000000000..ad17394c4a9 --- /dev/null +++ b/cuda/solver/batch_bicgstab_launch.instantiate.cu @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cuda/solver/batch_bicgstab_launch.cuh" + +#include + +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_bicgstab { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + auto get_num_regs = [](const auto func) { + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, func); + return funcattr.numRegs; + }; + const int num_regs_used = std::max( + get_num_regs( + batch_single_kernels::apply_kernel), + get_num_regs( + batch_single_kernels::apply_kernel)); + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, 
device_max_threads); + max_threads = max_threads <= max_bicgstab_threads ? max_threads + : max_bicgstab_threads; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + batch_single_kernels::apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const ValueType* const __restrict__ b_values, + ValueType* const __restrict__ x_values, + ValueType* const __restrict__ workspace_data, const int& block_size, + const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_BICGSTAB_GET_NUM_THREADS_PER_BLOCK); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_BICGSTAB_GET_MAX_DYNAMIC_SHARED_MEMORY); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); +// end + + +} // namespace batch_bicgstab +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 32e66d7ee54..126a62006cf 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -5,16 +5,13 @@ #include "core/solver/batch_cg_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "cuda/solver/batch_cg_launch.cuh" namespace gko { @@ -23,104 +20,35 @@ namespace cuda { namespace batch_cg { -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int 
num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - batch_single_kernels::apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes( - &funcattr, - batch_single_kernels::apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_cg::settings; - - -template +template class kernel_caller { public: - using value_type = CuValueType; + using cuda_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); + BatchMatrixType, cuda_value_type>( + exec_); const int block_size = get_num_threads_per_block( + BatchMatrixType, cuda_value_type>( exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); @@ -128,69 +56,66 @@ public: padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + cuda_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + (sconf.prec_shared ? 
prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); - - value_type* const workspace_data = workspace.get_data(); - - // TODO: split compilation - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. - // Template parameters launch_apply_kernel - // if (sconf.prec_shared) { - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); + + cuda_value_type* const workspace_data = workspace.get_data(); + + // Template parameters launch_apply_kernel + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -203,9 +128,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_cg_launch.cuh 
b/cuda/solver/batch_cg_launch.cuh new file mode 100644 index 00000000000..9cb470eb51b --- /dev/null +++ b/cuda/solver/batch_cg_launch.cuh @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_ +#define GKO_CUDA_SOLVER_BATCH_CG_LAUNCH_CUH_ + + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows); + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \ + pre_t, stop_t) \ + int get_num_threads_per_block< \ + stop_t>, pre_t>, \ + log_t>>, \ + mat_t>, cuda_type<_vtype>>( \ + std::shared_ptr exec, const int num_rows) + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \ + _vtype) + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec); + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_max_dynamic_shared_memory< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec) + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \ + _vtype) + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define 
GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu new file mode 100644 index 00000000000..89e96e85ace --- /dev/null +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cuda/solver/batch_cg_launch.cuh" + +#include + +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_cg { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + auto get_num_regs = [](const auto func) { + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, func); + return funcattr.numRegs; + }; + const int num_regs_used = std::max( + get_num_regs( + batch_single_kernels::apply_kernel), + get_num_regs( + batch_single_kernels::apply_kernel)); + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, device_max_threads); + max_threads = max_threads <= max_cg_threads ? 
max_threads : max_cg_threads; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + batch_single_kernels::apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +// end + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 7d914d57a81..4a540046322 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) +add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.hip.cpp BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. 
solver/batch_cg_launch.instantiate.hip.cpp BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES @@ -18,7 +20,9 @@ set(GINKGO_HIP_SOURCES ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp + ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.hip.cpp + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 17199d2cd19..697bcb94551 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -5,19 +5,13 @@ #include "core/solver/batch_bicgstab_kernels.hpp" #include -#include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_bicgstab_launch.hip.hpp" namespace gko { @@ -51,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr exec, } -template -using settings = gko::kernels::batch_bicgstab::settings; - - -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_bicgstab::storage_config& sconf, - LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -102,99 +73,92 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); - bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(is_block_size_aligned); + GKO_ASSERT(block_size % config::warp_size == 0); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_bicgstab::compute_shared_storage( - shmem_per_blk, padded_num_rows, 
mat.get_single_item_num_nnz(), - b.num_rhs); + const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage< + PrecType, hip_value_type>(shmem_per_blk, padded_num_rows, + mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - bool is_stride_aligned = - sconf.gmem_stride_bytes % sizeof(value_type) == 0; - GKO_ASSERT(is_stride_aligned); + sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + hip_value_type* const workspace_data = workspace.get_data(); - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. // Template parameters launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, workspace_data, - // block_size, shared_size); - // } else { - // switch (sconf.n_shared) { - // case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - // break; - // case 1: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 2: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 3: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 4: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 5: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 6: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 7: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 8: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // case 9: - // launch_apply_kernel( - // sconf, logger, prec, mat, b.values, x.values, - // workspace_data, block_size, shared_size); - // break; - // default: - // GKO_NOT_IMPLEMENTED; - // } - // } + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, 
shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 6: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 7: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 8: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 9: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } } private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -207,9 +171,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using hip_value_type = hip_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/hip/solver/batch_bicgstab_launch.hip.hpp b/hip/solver/batch_bicgstab_launch.hip.hpp new file mode 100644 index 00000000000..0f62a9487a3 --- /dev/null +++ b/hip/solver/batch_bicgstab_launch.hip.hpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ +#define GKO_HIP_SOLVER_BATCH_BICGSTAB_LAUNCH_HIP_HPP_ + + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_bicgstab { + + +template +using settings = gko::kernels::batch_bicgstab::settings; + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \ + mat_t, log_t, pre_t, stop_t) \ + void launch_apply_kernel<_vtype, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_bicgstab::storage_config& sconf, \ + const settings>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const hip_type<_vtype>* const __restrict__ b_values, \ + hip_type<_vtype>* const __restrict__ x_values, \ + hip_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \ 
+ GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false) +#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true) + + +} // namespace batch_bicgstab +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif diff --git a/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp new file mode 100644 index 00000000000..fb26c562a94 --- /dev/null +++ b/hip/solver/batch_bicgstab_launch.instantiate.hip.cpp @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_bicgstab_launch.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_bicgstab { + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_bicgstab::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_hip_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE); +// split 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE); +// end + + +} // namespace batch_bicgstab +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 6d5e3bff3b3..25ebd667a7e 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -5,18 +5,13 @@ #include "core/solver/batch_cg_kernels.hpp" #include -#include -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_cg_launch.hip.hpp" namespace gko { @@ -50,47 +45,24 @@ int get_num_threads_per_block(std::shared_ptr exec, } -template -using settings = gko::kernels::batch_cg::settings; - - -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - batch_single_kernels::apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -101,79 +73,74 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); - bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(is_block_size_aligned); + GKO_ASSERT(block_size % config::warp_size == 0); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + hip_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? 
-        auto workspace = gko::array(
+        auto workspace = gko::array(
             exec_,
-            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
-        bool is_stride_aligned =
-            sconf.gmem_stride_bytes % sizeof(value_type) == 0;
-        GKO_ASSERT(is_stride_aligned);
-
-        value_type* const workspace_data = workspace.get_data();
-
-        // Only instantiate when full optimizations has been enabled. Otherwise,
-        // just use the default one with no shared memory.
-        // Template parameters launch_apply_kernel(
-        //     sconf, logger, prec, mat, b.values, x.values, workspace_data,
-        //     block_size, shared_size);
-        // } else {
-        //     switch (sconf.n_shared) {
-        //     case 0:
-        launch_apply_kernel(
-            sconf, logger, prec, mat, b.values, x.values, workspace_data,
-            block_size, shared_size);
-        //         break;
-        //     case 1:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 2:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 3:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 4:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     case 5:
-        //         launch_apply_kernel(
-        //             sconf, logger, prec, mat, b.values, x.values,
-        //             workspace_data, block_size, shared_size);
-        //         break;
-        //     default:
-        //         GKO_NOT_IMPLEMENTED;
-        //     }
-        // }
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0);
+
+        hip_value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel
+        if (sconf.prec_shared) {
+            launch_apply_kernel(
+                exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
+                workspace_data, block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel(
+                    exec_, sconf, settings_, logger, prec, mat, b.values,
+                    x.values, workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
     }
 private:
     std::shared_ptr exec_;
-    const settings> settings_;
+    const settings> settings_;
 };
diff --git a/hip/solver/batch_cg_launch.hip.hpp b/hip/solver/batch_cg_launch.hip.hpp
new file mode 100644
index 00000000000..7071c5c4065
--- /dev/null
+++ b/hip/solver/batch_cg_launch.hip.hpp
@@ -0,0 +1,77 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_
+#define GKO_HIP_SOLVER_BATCH_CG_LAUNCH_HPP_
+
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const hip_type<_vtype>* const __restrict__ b_values, \ + hip_type<_vtype>* const __restrict__ x_values, \ + hip_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +} // namespace batch_cg +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif diff --git a/hip/solver/batch_cg_launch.instantiate.hip.cpp b/hip/solver/batch_cg_launch.instantiate.hip.cpp new file mode 100644 index 00000000000..3605a88651d --- /dev/null +++ b/hip/solver/batch_cg_launch.instantiate.hip.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "hip/solver/batch_cg_launch.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_cg { + + +template +void launch_apply_kernel(std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, + LogType& logger, PrecType& prec, + const BatchMatrixType& mat, + const hip_type* const __restrict__ b_values, + hip_type* const __restrict__ x_values, + hip_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + batch_single_kernels::apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, 
+            logger, prec, mat, b_values, x_values, workspace_data);
+}
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
+// end
+
+
+}  // namespace batch_cg
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
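
Note on the pattern used by the new *_launch.hip.hpp / *_launch.instantiate.hip.cpp files: a templated launcher is declared in a header, explicitly instantiated for each compile-time configuration in a separate translation unit (the // begin / // split / // end markers delimit groups that can be compiled separately), and the runtime switch over sconf.n_shared / sconf.prec_shared in kernel_caller::call_kernel picks the matching instantiation. The following is a minimal, self-contained C++ sketch of that idea only; it is not Ginkgo code, and all names in it are invented for illustration.

#include <cstdio>
#include <stdexcept>

// Declaration (would live in a *_launch.hpp-style header).
template <int n_shared, bool prec_shared>
void launch(int num_rows);

// Definition plus explicit instantiations (would live in a
// *_launch.instantiate.cpp-style translation unit).
template <int n_shared, bool prec_shared>
void launch(int num_rows)
{
    std::printf("launch<%d, %s> called for %d rows\n", n_shared,
                prec_shared ? "true" : "false", num_rows);
}

// One explicit instantiation per supported configuration.
template void launch<0, false>(int);
template void launch<1, false>(int);
template void launch<2, false>(int);
template void launch<2, true>(int);

// Runtime dispatch, mirroring the switch over sconf.n_shared / prec_shared.
void dispatch(int n_shared, bool prec_shared, int num_rows)
{
    if (prec_shared) {
        launch<2, true>(num_rows);
        return;
    }
    switch (n_shared) {
    case 0:
        launch<0, false>(num_rows);
        break;
    case 1:
        launch<1, false>(num_rows);
        break;
    case 2:
        launch<2, false>(num_rows);
        break;
    default:
        throw std::runtime_error("unsupported configuration");
    }
}

int main()
{
    dispatch(1, false, 32);
    return 0;
}

Splitting the explicit instantiations across translation units in this way keeps each compiled file small, which presumably is what the // split markers are for: each group can be built independently, reducing per-file compile time and memory use compared to instantiating every configuration in one kernel file.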