Format conversion, SpMV and write improvements #905

Merged Feb 1, 2022 (32 commits; changes shown from all commits)
Commits
b1c77eb  avoid calling empty GEMMs for CUDA/HIP (upsj, Oct 12, 2021)
3afdab6  simplify matrix write impl (upsj, Oct 12, 2021)
e4591d1  initialize Csr arrays to valid values (upsj, Oct 12, 2021)
e362d4f  fix CUDA compilation implicit include paths (upsj, Oct 13, 2021)
628e789  add CSR SpMM support, make `classical` fall-back (upsj, Oct 13, 2021)
04b404c  fix Csr initialization of empty load_balance mtx (upsj, Oct 18, 2021)
efd83d9  add is_zero/is_nonzero helpers (upsj, Nov 27, 2021)
5178a81  keep numeric zeros in matrix_data and conversions (upsj, Oct 18, 2021)
4a1d813  move row_ptr/row_idx conversion to components (upsj, Nov 29, 2021)
8a89975  unify convert_to(Dense) implementations (upsj, Nov 30, 2021)
e9803b9  unify convert_to(Csr) implementations (upsj, Nov 30, 2021)
ed864ac  fix benchmark LinOp warnings (upsj, Nov 30, 2021)
0462d4f  unify Dense/Csr::convert_to(...) (upsj, Dec 9, 2021)
03bd279  fix benchmark warnings (upsj, Dec 9, 2021)
89e5dac  fix sellp conversions (upsj, Dec 9, 2021)
ff833b4  fall back to classical in hip Csr SpMV (upsj, Dec 9, 2021)
57c56f9  fix sellp SpMV on GPUs (upsj, Dec 9, 2021)
eae6e48  fix const-correctness for Hybrid strategy (upsj, Dec 9, 2021)
b2dcca0  zero Fbcsr and Sellp pointer arrays (upsj, Dec 9, 2021)
72f0bbe  work around cuSPARSE issues with empty matrices (upsj, Dec 9, 2021)
77422ee  add Fbcsr conversion to Dense (upsj, Dec 9, 2021)
31ae3e6  guard empty CUDA kernel launches (upsj, Dec 9, 2021)
17891e5  improve sorting test output (upsj, Dec 9, 2021)
66b5ed2  remove unnecessary dim3 usages (upsj, Dec 9, 2021)
f732c3c  add conversions to Fbcsr (upsj, Dec 10, 2021)
f0a683d  review updates (upsj, Jan 24, 2022)
b4cfc27  simplify cuSPARSE type handling (upsj, Jan 25, 2022)
84ebf9d  fix accessor compilation issues (upsj, Jan 25, 2022)
b982ec6  review updates (upsj, Jan 30, 2022)
2d27767  add shortcuts for common reduction boilerplate (upsj, Jan 30, 2022)
f98170c  fix CUDA version detection for triangular solve (upsj, Jan 31, 2022)
a7379ad  fix dpcpp compilation (upsj, Jan 31, 2022)
6 changes: 6 additions & 0 deletions benchmark/CMakeLists.txt
@@ -12,6 +12,12 @@ endfunction()

function(ginkgo_benchmark_cusparse_linops type def)
add_library(cusparse_linops_${type} utils/cuda_linops.cu)
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
# remove false positive CUDA warnings when calling one<T>() and zero<T>()
target_compile_options(cusparse_linops_${type}
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
endif()
# make the dependency public to catch issues
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE})
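Note on the new flag: --expt-relaxed-constexpr lets nvcc call constexpr functions that lack an explicit __device__ attribute from device code, which is exactly what the one<T>() and zero<T>() helpers mentioned in the comment trigger. A minimal sketch of the situation, not part of the PR; zero_like is a hypothetical stand-in for such helpers:

// compile with: nvcc --expt-relaxed-constexpr example.cu
template <typename T>
constexpr T zero_like()  // constexpr, but not annotated with __device__
{
    return T{};
}

template <typename T>
__global__ void fill_zero(T* data, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // nvcc warns about this call unless --expt-relaxed-constexpr is set
        data[i] = zero_like<T>();
    }
}

int main()
{
    double* data;
    cudaMalloc(&data, 128 * sizeof(double));
    fill_zero<<<1, 128>>>(data, 128);
    cudaDeviceSynchronize();
    cudaFree(data);
    return 0;
}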
75 changes: 69 additions & 6 deletions benchmark/utils/cuda_linops.cu
@@ -76,12 +76,6 @@ public:
}

protected:
-    void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*,
-                    gko::LinOp*) const override
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
CusparseBase(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::LinOp(exec, size)
@@ -143,6 +137,12 @@ class CusparseCsrmp
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -174,6 +174,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrmp(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsrmp, CusparseBase>(exec, size),
@@ -203,6 +207,12 @@ class CusparseCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -234,6 +244,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsr, CusparseBase>(exec, size),
@@ -264,6 +278,12 @@ class CusparseCsrmm
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -296,6 +316,10 @@ protected:
dense_x->get_size()[0]);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrmm(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsrmm, CusparseBase>(exec, size),
@@ -329,6 +353,12 @@ class CusparseCsrEx
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -380,6 +410,9 @@ protected:
// DEVICE for Ginkgo
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrEx(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
@@ -422,6 +455,12 @@ class CusparseHybrid
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -471,6 +510,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseHybrid(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
@@ -551,10 +594,16 @@ class CusparseGenericCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
cusparseIndexType_t cu_index =
gko::kernels::cuda::cusparse_index_type<IndexType>();
cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
using gko::kernels::cuda::as_culibs_type;
@@ -598,6 +647,10 @@ protected:
Alg);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseGenericCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseGenericCsr, CusparseBase>(exec, size),
@@ -629,10 +682,16 @@ class CusparseGenericCoo
public:
using coo = gko::matrix::Coo<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
cusparseIndexType_t cu_index =
gko::kernels::cuda::cusparse_index_type<IndexType>();
cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
using gko::kernels::cuda::as_culibs_type;
@@ -676,6 +735,10 @@ protected:
CUSPARSE_MV_ALG_DEFAULT);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseGenericCoo(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseGenericCoo, CusparseBase>(exec, size),
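Every wrapper class in this file receives the same two additions, so the pattern is worth stating once: a read(device_mat_data) overload that copies device data to the host and reuses the existing host-side read(), and the advanced-apply stub that moved from CusparseBase into each subclass. A condensed sketch of that pattern; CusparseWrapper is a placeholder name, and the constructors, cusparse handles, and Readable base classes are elided, so this fragment is illustrative rather than compilable on its own:

template <typename ValueType, typename IndexType>
class CusparseWrapper
    : public gko::EnableLinOp<CusparseWrapper<ValueType, IndexType>> {
public:
    using mat_data = gko::matrix_data<ValueType, IndexType>;
    using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

    // new overload: device_matrix_data input falls back to the host path
    void read(const device_mat_data& data)
    {
        this->read(data.copy_to_host());
    }

    void read(const mat_data& data)
    {
        // format-specific conversion and upload, unchanged by this PR
    }

protected:
    // simple apply (x = op(b)) remains the format-specific SpMV call
    void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
    {
        // cusparse SpMV dispatch, unchanged by this PR
    }

    // advanced apply (x = alpha * op(b) + beta * x) is not benchmarked;
    // previously stubbed once in CusparseBase, now declared per subclass
    void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
                    const gko::LinOp* beta,
                    gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;
};

The HIP wrappers in the next file receive the same treatment.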
36 changes: 30 additions & 6 deletions benchmark/utils/hip_linops.hip.cpp
@@ -65,12 +65,6 @@ class HipsparseBase : public gko::LinOp {
const gko::HipExecutor* get_gpu_exec() const { return gpu_exec_.get(); }

protected:
-    void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*,
-                    gko::LinOp*) const override
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
HipsparseBase(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::LinOp(exec, size)
@@ -130,6 +124,12 @@ class HipsparseCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -161,6 +161,10 @@ class HipsparseCsr
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseCsr, HipsparseBase>(exec, size),
@@ -191,6 +195,12 @@ class HipsparseCsrmm
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -223,6 +233,10 @@ class HipsparseCsrmm
dense_x->get_size()[0]);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseCsrmm(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseCsrmm, HipsparseBase>(exec, size),
@@ -257,6 +271,12 @@ class HipsparseHybrid
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -306,6 +326,10 @@ class HipsparseHybrid
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseHybrid(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseHybrid, HipsparseBase>(exec, size),
3 changes: 2 additions & 1 deletion common/CMakeLists.txt
@@ -1,8 +1,9 @@
 set(UNIFIED_SOURCES
     base/index_set_kernels.cpp
-    components/device_matrix_data_kernels.cpp
     components/absolute_array_kernels.cpp
+    components/device_matrix_data_kernels.cpp
     components/fill_array_kernels.cpp
+    components/format_conversion_kernels.cpp
     components/precision_conversion_kernels.cpp
     components/reduce_array_kernels.cpp
     distributed/partition_kernels.cpp
4 changes: 2 additions & 2 deletions common/cuda_hip/components/warp_blas.hpp.inc
@@ -60,7 +60,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform(
ValueType* __restrict__ row, bool& __restrict__ status)
{
auto key_col_elem = group.shfl(row[key_col], key_row);
-    if (key_col_elem == zero<ValueType>()) {
+    if (is_zero(key_col_elem)) {
// TODO: implement error handling for GPUs to be able to properly
// report it here
status = false;
@@ -105,7 +105,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform_with_rhs(
{
auto key_col_elem = group.shfl(row[key_col], key_row);
auto key_rhs_elem = group.shfl(rhs[0], key_row);
-    if (key_col_elem == zero<ValueType>()) {
+    if (is_zero(key_col_elem)) {
// TODO: implement error handling for GPUs to be able to properly
// report it here
status = false;
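The is_zero call sites above come from commit efd83d9 ("add is_zero/is_nonzero helpers"). Plausibly the helpers are thin predicates over Ginkgo's existing zero<T>() constant; a sketch of that shape, where the exact Ginkgo definitions may differ in attributes and namespace, and zero<T>() is restated here only for self-containment:

// sketch only: Ginkgo's real definitions live in its math headers
template <typename T>
__host__ __device__ constexpr T zero()
{
    return T{};
}

template <typename T>
__host__ __device__ constexpr bool is_zero(T value)
{
    return value == zero<T>();
}

template <typename T>
__host__ __device__ constexpr bool is_nonzero(T value)
{
    return value != zero<T>();
}

A named predicate reads better at call sites like the ones above and avoids spelling out zero<ValueType>() in every comparison.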
55 changes: 0 additions & 55 deletions common/cuda_hip/matrix/coo_kernels.hpp.inc
@@ -218,58 +218,3 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmm(


} // namespace


namespace kernel {


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
const IndexType* __restrict__ idxs, size_type num_nonzeros,
IndexType* __restrict__ ptrs, size_type length)
{
const auto tidx = thread::get_thread_id_flat();

if (tidx == 0) {
ptrs[0] = 0;
ptrs[length - 1] = num_nonzeros;
}

if (0 < tidx && tidx < num_nonzeros) {
if (idxs[tidx - 1] < idxs[tidx]) {
for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) {
ptrs[i] = tidx;
}
}
}
}


template <typename ValueType>
__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
size_type num_rows, size_type num_cols, size_type stride,
ValueType* __restrict__ result)
{
const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
if (tidx_x < num_cols && tidx_y < num_rows) {
result[tidx_y * stride + tidx_x] = zero<ValueType>();
}
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void fill_in_dense(
size_type nnz, const IndexType* __restrict__ row_idxs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values, size_type stride,
ValueType* __restrict__ result)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < nnz) {
result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx];
}
}


} // namespace kernel
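These kernels were removed rather than lost: per commit 4a1d813, the row_idx/row_ptr conversion moved into the shared components (note the new components/format_conversion_kernels.cpp entry in common/CMakeLists.txt above), while the dense initialization and fill kernels appear to be subsumed by the unified convert_to(Dense) work from commits 8a89975 and 0462d4f. For reference, one standard sequential way to compute the same result as the deleted convert_row_idxs_to_ptrs kernel, assuming idxs is sorted ascending; this is a reference sketch, not the code that replaced the kernel:

#include <vector>

template <typename IndexType>
std::vector<IndexType> row_idxs_to_ptrs(const std::vector<IndexType>& idxs,
                                        IndexType num_rows)
{
    std::vector<IndexType> ptrs(num_rows + 1, 0);
    // count the number of entries in each row
    for (auto row : idxs) {
        ptrs[row + 1]++;
    }
    // exclusive prefix sum turns the counts into row pointers,
    // so ptrs[num_rows] ends up equal to the number of nonzeros
    for (IndexType row = 0; row < num_rows; row++) {
        ptrs[row + 1] += ptrs[row];
    }
    return ptrs;
}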